From 91da337e5d506f2c065d20529d105ca40090e320 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 19 Jul 2024 14:55:53 -0400 Subject: nfsd: don't set SVC_SOCK_ANONYMOUS when creating nfsd sockets When creating nfsd sockets via the netlink interface, we do want to register with the portmapper. Don't set SVC_SOCK_ANONYMOUS. Reported-by: Steve Dickson Fixes: 16a471177496 ("NFSD: add listener-{set,get} netlink command") Cc: Lorenzo Bianconi Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 9e0ea6fc2aa3..34eb2c2cbcde 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2069,8 +2069,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) continue; } - ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, - SVC_SOCK_ANONYMOUS, + ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa, 0, get_current_cred()); /* always save the latest error */ if (ret < 0) -- cgit From e61dd678601eac53d501dda1eb1bcffec7b11bd6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 18:21:32 -0400 Subject: bcachefs: Fix double free of ca->buckets_nouse Reported-by: Dan Carpenter Fixes: ffcbec6076 ("bcachefs: Kill opts.buckets_nouse") Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 0455a1001fec..e7fa2de35014 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1193,7 +1193,6 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); - kfree(ca->buckets_nouse); bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); -- cgit From 33eb1e5db351e2c0e652d878b66b8a6d4d013135 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Mar 2024 13:40:30 +1030 Subject: btrfs: factor out stripe length calculation into a helper Currently there are two locations which need to calculate the real length of a stripe (which can be at the end of a chunk, and the chunk size may not always be 64K aligned). Factor them into a helper as we're going to have a third user soon. 
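For intuition, here is a standalone userspace sketch of the clamping the new helper encodes; the chunk geometry below is invented for illustration and is not taken from any real filesystem:

	#include <stdio.h>
	#include <stdint.h>

	#define STRIPE_LEN (64 * 1024ULL)	/* stands in for BTRFS_STRIPE_LEN */

	/* hypothetical block group: just the fields the helper reads */
	struct bg { uint64_t start, length; };

	static uint64_t stripe_length(const struct bg *bg, uint64_t logical)
	{
		uint64_t rest = bg->start + bg->length - logical;
		return rest < STRIPE_LEN ? rest : STRIPE_LEN;	/* the min() */
	}

	int main(void)
	{
		/* chunk starts at 1M and is 1M + 16K long, so the tail stripe is short */
		struct bg bg = { .start = 1 << 20, .length = (1 << 20) + (16 << 10) };

		printf("%llu\n", (unsigned long long)stripe_length(&bg, 1 << 20)); /* 65536 */
		printf("%llu\n", (unsigned long long)stripe_length(&bg, 2 << 20)); /* 16384 */
		return 0;
	}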
Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 14a8d7100018..439545710d76 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1648,14 +1648,20 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static u32 stripe_length(const struct scrub_stripe *stripe) +{ + ASSERT(stripe->bg); + + return min(BTRFS_STRIPE_LEN, + stripe->bg->start + stripe->bg->length - stripe->logical); +} + static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; @@ -1729,9 +1735,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_bio *bbio; - unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + - stripe->bg->length - stripe->logical) >> - fs_info->sectorsize_bits; + unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; int mirror = stripe->mirror_num; ASSERT(stripe->bg); -- cgit From 63447b7dd40c6a9ae8d3bb70c11f4c46731823e3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Mar 2024 13:40:31 +1030 Subject: btrfs: scrub: update last_physical after scrubbing one stripe Currently sctx->stat.last_physical only gets updated in the following cases: - When the last stripe of a non-RAID56 chunk is scrubbed This implies a pitfall: if the last stripe is at the chunk boundary and we finished the scrub of the whole chunk, we won't update last_physical at all until the next chunk. - When a P/Q stripe of a RAID56 chunk is scrubbed This leads to the following two problems: - sctx->stat.last_physical is not updated for an almost full chunk This is especially bad, affecting scrub resume, as the resume would start from last_physical, causing unnecessary re-scrub. - "btrfs scrub status" will not report any progress for a long time Fix the problem by properly updating @last_physical after each stripe is scrubbed. And since we're here, for the sake of consistency, use a spin lock to protect the update of @last_physical, just like all the remaining call sites touching sctx->stat.
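The locking pattern the fix adopts can be sketched in plain userspace C (all names below are invented for the example; the kernel side uses sctx->stat_lock): every writer of the shared progress counter takes the same lock, so a concurrent reader such as "btrfs scrub status" sees a consistent value.

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	struct scrub_stats {
		pthread_spinlock_t lock;
		uint64_t last_physical;
	};

	/* called once per finished stripe, mirroring the fix */
	static void stripe_done(struct scrub_stats *st, uint64_t physical, uint64_t len)
	{
		pthread_spin_lock(&st->lock);
		st->last_physical = physical + len;	/* progress advances per stripe */
		pthread_spin_unlock(&st->lock);
	}

	int main(void)
	{
		struct scrub_stats st = { .last_physical = 0 };

		pthread_spin_init(&st.lock, PTHREAD_PROCESS_PRIVATE);
		stripe_done(&st, 1 << 20, 64 << 10);
		printf("last_physical=%llu\n", (unsigned long long)st.last_physical);
		pthread_spin_destroy(&st.lock);
		return 0;
	}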
Reported-by: Michel Palleau Link: https://lore.kernel.org/linux-btrfs/CAMFk-+igFTv2E8svg=cQ6o3e6CrR5QwgQ3Ok9EyRaEvvthpqCQ@mail.gmail.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 439545710d76..0de9162ff481 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1875,6 +1875,9 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) stripe = &sctx->stripes[i]; wait_scrub_stripe_io(stripe); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = stripe->physical + stripe_length(stripe); + spin_unlock(&sctx->stat_lock); scrub_reset_stripe(stripe); } out: @@ -2143,7 +2146,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ + spin_lock(&sctx->stat_lock); sctx->stat.last_physical = physical + logical_length; + spin_unlock(&sctx->stat_lock); ret = 0; break; } @@ -2340,6 +2345,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, stripe_logical += chunk_logical; ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, map, stripe_logical); + spin_lock(&sctx->stat_lock); + sctx->stat.last_physical = min(physical + BTRFS_STRIPE_LEN, + physical_end); + spin_unlock(&sctx->stat_lock); if (ret) goto out; goto next; -- cgit From 872617a0896fc7510b0b8f25d323670424461cfc Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 18 Jul 2024 14:46:23 -0700 Subject: btrfs: implement launder_folio for clearing dirty page reserve In the buffered write path, dirty pages can be said to "own" the qgroup reservation until they create an ordered_extent. It is possible for there to be outstanding dirty pages when a transaction is aborted, in which case there is no cancellation path for freeing this reservation and it is leaked. We do already walk the list of outstanding delalloc inodes in btrfs_destroy_delalloc_inodes() and call invalidate_inode_pages2() on them. This does *not* call btrfs_invalidate_folio(), as one might guess, but rather calls launder_folio() and release_folio(). Since this is a reservation associated with dirty pages only, rather than something associated with the private bit (ordered_extent is cancelled separately already in the cleanup transaction path), this release should be implemented via launder_folio.
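The reservation lifetime being described reduces to a toy model (standalone userspace C, invented names; nothing here is the btrfs API) showing why the launder path has to hand the reservation back:

	#include <stdbool.h>
	#include <stdio.h>

	static long qgroup_reserved;	/* bytes currently reserved */

	struct page { bool dirty; long rsv; };

	static void dirty_page(struct page *p, long n)
	{
		qgroup_reserved += n;	/* buffered write reserves... */
		p->rsv = n;		/* ...and the dirty page owns it */
		p->dirty = true;
	}

	/* what the new btrfs_launder_folio() amounts to in this model */
	static void launder(struct page *p)
	{
		if (p->dirty) {
			qgroup_reserved -= p->rsv;	/* hand the reservation back */
			p->rsv = 0;
			p->dirty = false;
		}
	}

	int main(void)
	{
		struct page p = { 0 };

		dirty_page(&p, 4096);
		launder(&p);		/* transaction abort path, no ordered extent */
		printf("leaked=%ld\n", qgroup_reserved);	/* 0: nothing leaked */
		return 0;
	}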
Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1d4e0a65494a..1659ed3a0ba9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7202,6 +7202,12 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } +static int btrfs_launder_folio(struct folio *folio) +{ + return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), + PAGE_SIZE, NULL); +} + static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (try_release_extent_mapping(&folio->page, gfp_flags)) { @@ -10137,6 +10143,7 @@ static const struct address_space_operations btrfs_aops = { .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, + .launder_folio = btrfs_launder_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, -- cgit From 30479f31d44d47ed00ae0c7453d9b253537005b2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 19 Jul 2024 16:49:08 -0700 Subject: btrfs: fix qgroup reserve leaks in cow_file_range In the buffered write path, the dirty page owns the qgroup reserve until it creates an ordered_extent. Therefore, any errors that occur before the ordered_extent is created must free that reservation, or else the space is leaked. The fstest generic/475 exercises various IO error paths, and is able to trigger errors in cow_file_range where we fail to get to allocating the ordered extent. Note that because we *do* clear delalloc, we are likely to remove the inode from the delalloc list, so the inodes/pages do not have invalidate/launder called on them in the commit abort path. This results in failures at the unmount stage of the test that look like: BTRFS: error (device dm-8 state EA) in cleanup_transaction:2018: errno=-5 IO failure BTRFS: error (device dm-8 state EA) in btrfs_replace_file_extents:2416: errno=-5 IO failure BTRFS warning (device dm-8 state EA): qgroup 0/5 has unreleased space, type 0 rsv 28672 ------------[ cut here ]------------ WARNING: CPU: 3 PID: 22588 at fs/btrfs/disk-io.c:4333 close_ctree+0x222/0x4d0 [btrfs] Modules linked in: btrfs blake2b_generic libcrc32c xor zstd_compress raid6_pq CPU: 3 PID: 22588 Comm: umount Kdump: loaded Tainted: G W 6.10.0-rc7-gab56fde445b8 #21 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 RIP: 0010:close_ctree+0x222/0x4d0 [btrfs] RSP: 0018:ffffb4465283be00 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffffa1a1818e1000 RCX: 0000000000000001 RDX: 0000000000000000 RSI: ffffb4465283bbe0 RDI: ffffa1a19374fcb8 RBP: ffffa1a1818e13c0 R08: 0000000100028b16 R09: 0000000000000000 R10: 0000000000000003 R11: 0000000000000003 R12: ffffa1a18ad7972c R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f9168312b80(0000) GS:ffffa1a4afcc0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f91683c9140 CR3: 000000010acaa000 CR4: 00000000000006f0 Call Trace: ? close_ctree+0x222/0x4d0 [btrfs] ? __warn.cold+0x8e/0xea ? close_ctree+0x222/0x4d0 [btrfs] ? report_bug+0xff/0x140 ? handle_bug+0x3b/0x70 ? exc_invalid_op+0x17/0x70 ? asm_exc_invalid_op+0x1a/0x20 ?
close_ctree+0x222/0x4d0 [btrfs] generic_shutdown_super+0x70/0x160 kill_anon_super+0x11/0x40 btrfs_kill_super+0x11/0x20 [btrfs] deactivate_locked_super+0x2e/0xa0 cleanup_mnt+0xb5/0x150 task_work_run+0x57/0x80 syscall_exit_to_user_mode+0x121/0x130 do_syscall_64+0xab/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f916847a887 ---[ end trace 0000000000000000 ]--- BTRFS error (device dm-8 state EA): qgroup reserved space leaked Cases 2 and 3 in the out_reserve path both pertain to this type of leak and must free the reserved qgroup data. Because it is already an error path, I opted not to handle the possible errors in btrfs_qgroup_free_data. Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1659ed3a0ba9..ceee5422089b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1585,6 +1585,7 @@ out_unlock: locked_page, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; } @@ -1598,6 +1599,7 @@ out_unlock: clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, &cached, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } return ret; } @@ -2259,6 +2261,7 @@ error: EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); return ret; -- cgit From 1e7bec1f7d6533f08bc1c4ee94930c02361db86c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 29 Jul 2024 11:05:48 -0400 Subject: btrfs: emit a warning about space cache v1 being deprecated We've been wanting to get rid of this for a while. Add a message to indicate that this feature is going away and when, so we can finally have a date for removing it. The output looks like this: BTRFS warning (device nvme0n1): space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2 Reviewed-by: Qu Wenruo Reviewed-by: Neal Gompa Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 08d33cb372fb..83478deada3b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -683,8 +683,11 @@ bool btrfs_check_options(const struct btrfs_fs_info *info, ret = false; if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { - if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_info(info, "disk space caching is enabled"); + btrfs_warn(info, +"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); + } if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) btrfs_info(info, "using free-space-tree"); } -- cgit From e0391e92f9ab4fb3dbdeb139c967dcfa7ac4b115 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 2 Aug 2024 09:38:51 +0100 Subject: btrfs: fix double inode unlock for direct IO sync writes If we do a direct IO sync write, at btrfs_sync_file(), and we need to skip inode logging or we get an error starting a transaction or an error when flushing delalloc, we end up unlocking the inode when we shouldn't under the 'out_release_extents' label, and then unlock it again at btrfs_direct_write().
Fix that by checking if we have to skip inode unlocking under that label. Reported-by: syzbot+7dbbb74af6291b5a5a8b@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/000000000000dfd631061eaeb4bc@google.com/ Fixes: 939b656bc8ab ("btrfs: fix corruption after buffer fault in during direct IO append write") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f10a9f23fcc..9914419f3b7d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1868,7 +1868,10 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + if (skip_ilock) + up_write(&inode->i_mmap_lock); + else + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } -- cgit From 12653ec36112ab55fa06c01db7c4432653d30a8d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 19 Jul 2024 18:56:46 +0930 Subject: btrfs: avoid using fixed char array size for tree names [BUG] There is a bug report that, with the latest trunk GCC 15, btrfs triggers an unterminated-string-initialization warning: linux-6.6/fs/btrfs/print-tree.c:29:49: error: initializer-string for array of ‘char’ is too long [-Werror=unterminated-string-initialization] 29 | { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, | ^~~~~~~~~~~~~~~~~~ [CAUSE] To print tree names we have an array of root_name_map structure, which uses "char name[16];" to store the name string of a tree. But the following trees have names exactly at 16 chars length: - "BLOCK_GROUP_TREE" - "RAID_STRIPE_TREE" This means we will have no space for the terminating '\0', which can lead to unexpected access when printing the name. [FIX] Instead of "char name[16];", use "const char *". Since the name strings are all read-only data, and are all NULL terminated by default, there is not much need to bother with the length at all. Reported-by: Sam James Reported-by: Alejandro Colomar Fixes: edde81f1abf29 ("btrfs: add raid stripe tree pretty printer") Fixes: 9c54e80ddc6bd ("btrfs: add code to support the block group root") CC: stable@vger.kernel.org # 6.1+ Suggested-by: Alejandro Colomar Reviewed-by: Johannes Thumshirn Reviewed-by: Alejandro Colomar Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/print-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 32dcea662da3..fc821aa446f0 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -14,7 +14,7 @@ struct root_name_map { u64 id; - char name[16]; + const char *name; }; static const struct root_name_map root_map[] = { -- cgit From 9a2fa1472083580b6c66bdaf291f591e1170123a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Aug 2024 18:02:00 -0400 Subject: fix bitmap corruption on close_range() with CLOSE_RANGE_UNSHARE copy_fd_bitmaps(new, old, count) is expected to copy the first count/BITS_PER_LONG bits from old->full_fds_bits[] and fill the rest with zeroes. What it does is copy enough words (BITS_TO_LONGS(count/BITS_PER_LONG)), then memset the rest. That works fine, *if* all bits past the cutoff point are clear. Otherwise we are risking garbage from the last word we'd copied.
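To make that hazard concrete before going through the callers, here is a standalone sketch (invented sizes, plain userspace C, not the fdtable layout itself) of how rounding a bit-copy up to whole words drags in set bits past the cutoff:

	#include <stdio.h>
	#include <string.h>

	#define BITS_PER_LONG (8 * (int)sizeof(long))
	#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

	int main(void)
	{
		unsigned long old[1] = { ~0UL };	/* every bit set */
		unsigned long new[1] = { 0 };
		int count = 40;				/* want only bits 0..39 */

		/* word-granular copy: the bits past 'count' come along for the ride */
		memcpy(new, old, BITS_TO_LONGS(count) * sizeof(long));

		printf("top bit leaked: %lu\n", (new[0] >> (BITS_PER_LONG - 1)) & 1); /* 1 */
		return 0;
	}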
For most of the callers that is true - expand_fdtable() has count equal to old->max_fds, so there are no open descriptors past count, let alone fully occupied words in ->open_fds[], which is what bits in ->full_fds_bits[] correspond to. The other caller (dup_fd()) passes sane_fdtable_size(old_fdt, max_fds), which is the smallest multiple of BITS_PER_LONG that covers all opened descriptors below max_fds. In the common case (copying on fork()) max_fds is ~0U, so all opened descriptors will be below it and we are fine, by the same reasons why the call in expand_fdtable() is safe. Unfortunately, there is a case where max_fds is less than that and where we might, indeed, end up with junk in ->full_fds_bits[] - close_range(from, to, CLOSE_RANGE_UNSHARE) with * descriptor table being currently shared * 'to' being above the current capacity of descriptor table * 'from' being just under some chunk of opened descriptors. In that case we end up with observably wrong behaviour - e.g. spawn a child with CLONE_FILES, get all descriptors in range 0..127 open, then close_range(64, ~0U, CLOSE_RANGE_UNSHARE) and watch dup(0) ending up with descriptor #128, despite #64 being observably not open. The minimally invasive fix would be to deal with that in dup_fd(). If this proves to add measurable overhead, we can go that way, but let's try to fix copy_fd_bitmaps() first. * new helper: bitmap_copy_and_extend(to, from, bits_to_copy, size). * make copy_fd_bitmaps() take the bitmap size in words, rather than bits; its 'count' argument is always a multiple of BITS_PER_LONG, so we are not losing any information, and that way we can use the same helper for all three bitmaps - compiler will see that count is a multiple of BITS_PER_LONG for the large ones, so it'll generate plain memcpy()+memset(). Reproducer added to tools/testing/selftests/core/close_range_test.c Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file.c | 30 +++++++++------------ include/linux/bitmap.h | 12 +++++++++ tools/testing/selftests/core/close_range_test.c | 35 +++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index a11e59b5d602..655338effe9c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_head *rcu) #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) +#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write.
*/ -static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, - unsigned int count) +static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, + unsigned int copy_words) { - unsigned int cpy, set; - - cpy = count / BITS_PER_BYTE; - set = (nfdt->max_fds - count) / BITS_PER_BYTE; - memcpy(nfdt->open_fds, ofdt->open_fds, cpy); - memset((char *)nfdt->open_fds + cpy, 0, set); - memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); - memset((char *)nfdt->close_on_exec + cpy, 0, set); - - cpy = BITBIT_SIZE(count); - set = BITBIT_SIZE(nfdt->max_fds) - cpy; - memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); - memset((char *)nfdt->full_fds_bits + cpy, 0, set); + unsigned int nwords = fdt_words(nfdt); + + bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, + copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); + bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, + copy_words, nwords); } /* @@ -84,7 +80,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); - copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); + copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* @@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int open_files = sane_fdtable_size(old_fdt, max_fds); } - copy_fd_bitmaps(new_fdt, old_fdt, open_files); + copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 8c4768c44a01..d3b66d77df7a 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -270,6 +270,18 @@ static inline void bitmap_copy_clear_tail(unsigned long *dst, dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); } +static inline void bitmap_copy_and_extend(unsigned long *to, + const unsigned long *from, + unsigned int count, unsigned int size) +{ + unsigned int copy = BITS_TO_LONGS(count); + + memcpy(to, from, copy * sizeof(long)); + if (count % BITS_PER_LONG) + to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); + memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); +} + /* * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 * machines the order of hi and lo parts of numbers match the bitmap structure. diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c index 991c473e3859..12b4eb9d0434 100644 --- a/tools/testing/selftests/core/close_range_test.c +++ b/tools/testing/selftests/core/close_range_test.c @@ -589,4 +589,39 @@ TEST(close_range_cloexec_unshare_syzbot) EXPECT_EQ(close(fd3), 0); } +TEST(close_range_bitmap_corruption) +{ + pid_t pid; + int status; + struct __clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + /* get the first 128 descriptors open */ + for (int i = 2; i < 128; i++) + EXPECT_GE(dup2(0, i), 0); + + /* get descriptor table shared */ + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + /* unshare and truncate descriptor table down to 64 */ + if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE)) + exit(EXIT_FAILURE); + + ASSERT_EQ(fcntl(64, F_GETFD), -1); + /* ... 
and verify that the range 64..127 is not + stuck "fully used" according to secondary bitmap */ + EXPECT_EQ(dup(0), 64) + exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); +} + TEST_HARNESS_MAIN -- cgit From 7442b5cdf259e2fb112560904b7002ce48d15578 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 20:33:25 -0400 Subject: bcachefs: Don't rely on implicit unsigned -> signed integer conversion implicit integer conversion is a fertile source of bugs, and we really would rather not have the min()/max() macros doing it implicitly. bcachefs appears to be the only place in the kernel where this happens, so let's fix it. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8d2b62c9588e..f13e619b4b21 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -166,8 +166,8 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, * avoid overflowing LRU_TIME_BITS on a corrupted fs, when * bucket_sectors_dirty is (much) bigger than bucket_size */ - u64 d = min(bch2_bucket_sectors_dirty(a), - ca->mi.bucket_size); + u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), + ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } -- cgit From 90b211fa2da3f36939e84b6426988832a62caf4b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 30 Jul 2024 20:35:59 -0400 Subject: bcachefs: Add a comment for bucket helper types We've had bugs in the past with incorrect integer conversions in disk accounting code, which is why bucket helpers now always return s64s; add a comment explaining this. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index f13e619b4b21..96a0444ea78f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -82,6 +82,14 @@ static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, bucket_data_type(bucket) != bucket_data_type(ptr); } +/* + * It is my general preference to use unsigned types for unsigned quantities - + * however, these helpers are used in disk accounting calculations run by + * triggers where the output will be negated and added to an s64. unsigned is + * right out even though all these quantities will fit in 32 bits, since it + * won't be sign extended correctly; u64 will negate "correctly", but s64 is the + * simpler option here. 
+ */ static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; -- cgit From 02026e8931366158d7395f87afeb0b535210dbee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 22:49:31 -0400 Subject: bcachefs: Add missing bch2_trans_begin() call Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 4531c9ab3e12..7ee3b75480df 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -406,6 +406,7 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->read_pos, BTREE_ITER_slots); retry: + bch2_trans_begin(trans); rbio->bio.bi_status = 0; k = bch2_btree_iter_peek_slot(&iter); -- cgit From c1e4446247b2a8919649fb9aae2d86f53bf3d1e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 21:02:34 -0400 Subject: bcachefs: Improved allocator debugging for ec chasing down a device removal deadlock with erasure coding Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 6 ++++-- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/ec.c | 31 ++++++++++++++++++++----------- fs/bcachefs/sysfs.c | 6 +++++- 4 files changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 618d2ff0292e..8683fe4fae5b 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1603,7 +1603,8 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope prt_newline(out); } -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) { struct open_bucket *ob; @@ -1613,7 +1614,8 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) + if (ob->valid && !ob->on_partial_list && + (!ca || ob->dev == ca->dev_idx)) bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 6da9e7e29026..c78a64ec0553 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -223,7 +223,7 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_bucket_to_text(struct printbuf *, struct bch_fs *, struct open_bucket *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *, struct bch_dev *); void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 9b5b5c9a6c63..8d0cca2f14ed 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2235,6 +2235,23 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) mutex_unlock(&c->ec_stripes_heap_lock); } +static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct ec_stripe_new *s) +{ + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u ref %u %u %s obs", + s->idx, s->nr_data, s->nr_parity, + bitmap_weight(s->blocks_allocated, s->nr_data), + 
atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i; + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) + prt_printf(out, " %u", s->blocks[i]); + prt_newline(out); +} + void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) { struct ec_stripe_head *h; @@ -2247,23 +2264,15 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) bch2_watermarks[h->watermark]); if (h->s) - prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", - h->s->idx, h->s->nr_data, h->s->nr_parity, - bitmap_weight(h->s->blocks_allocated, - h->s->nr_data)); + bch2_new_stripe_to_text(out, c, h->s); } mutex_unlock(&c->ec_stripe_head_lock); prt_printf(out, "in flight:\n"); mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", - s->idx, s->nr_data, s->nr_parity, - atomic_read(&s->ref[STRIPE_REF_io]), - atomic_read(&s->ref[STRIPE_REF_stripe]), - bch2_watermarks[s->h->watermark]); - } + list_for_each_entry(s, &c->ec_stripe_new_list, list) + bch2_new_stripe_to_text(out, c, s); mutex_unlock(&c->ec_stripe_new_lock); } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1c0d1fb20276..f393023a3ae2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -367,7 +367,7 @@ SHOW(bch2_fs) bch2_stripes_heap_to_text(out, c); if (attr == &sysfs_open_buckets) - bch2_open_buckets_to_text(out, c); + bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); @@ -811,6 +811,9 @@ SHOW(bch2_dev) if (attr == &sysfs_alloc_debug) bch2_dev_alloc_debug_to_text(out, ca); + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); + return 0; } @@ -892,6 +895,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_open_buckets, NULL }; -- cgit From 2caca9fb166f82937110368768002511628e6e1f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 6 Aug 2024 23:30:48 -0400 Subject: bcachefs: ec should not allocate from ro devs This fixes a device removal deadlock when using erasure coding. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 8d0cca2f14ed..84f1cbf6497f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1809,6 +1809,9 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); BUG_ON(v->nr_redundant != h->s->nr_parity); + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { __clear_bit(v->ptrs[i].dev, devs.d); if (i < h->s->nr_data) -- cgit From 0df2ac59bebfac221463ef57ed3554899b41d75f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 7 Aug 2024 13:51:38 +0200 Subject: tracefs: Fix inode allocation The leading comment above alloc_inode_sb() is pretty explicit about it: /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ Switch tracefs over to alloc_inode_sb() to make sure inodes are properly linked. 
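As a toy illustration of the difference (userspace C with invented types, not the kernel API): a superblock-aware allocator attaches the object to its superblock at allocation time, which is exactly the linkage a bare cache allocation misses.

	#include <stdio.h>
	#include <stdlib.h>

	struct super_block { const char *name; int nr_inodes; };
	struct my_inode { struct super_block *sb; };

	/* analogue of kmem_cache_alloc(): no idea which sb this belongs to */
	static struct my_inode *cache_alloc(void)
	{
		return calloc(1, sizeof(struct my_inode));
	}

	/* analogue of alloc_inode_sb(): allocate *and* link to the sb */
	static struct my_inode *alloc_inode_sb_toy(struct super_block *sb)
	{
		struct my_inode *i = cache_alloc();

		if (i) {
			i->sb = sb;	/* reclaim can now find this inode via the sb */
			sb->nr_inodes++;
		}
		return i;
	}

	int main(void)
	{
		struct super_block sb = { "tracefs", 0 };
		struct my_inode *i = alloc_inode_sb_toy(&sb);

		printf("%s inodes: %d\n", sb.name, sb.nr_inodes);
		free(i);
		return 0;
	}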
Cc: Ajay Kaher Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Al Viro Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20240807115143.45927-2-minipli@grsecurity.net Fixes: ba37ff75e04b ("eventfs: Implement tracefs_inode_cache") Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1028ab6d9a74..21a7e51fc3c1 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -42,7 +42,7 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) struct tracefs_inode *ti; unsigned long flags; - ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL); + ti = alloc_inode_sb(sb, tracefs_inode_cachep, GFP_KERNEL); if (!ti) return NULL; -- cgit From 12c20c65d0460cf34f9a665d8f0c0d77d45a3829 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 23 Jul 2024 14:25:21 +0200 Subject: eventfs: Don't return NULL in eventfs_create_dir() Commit 77a06c33a22d ("eventfs: Test for ei->is_freed when accessing ei->dentry") added another check, testing if the parent was freed after we released the mutex. If so, the function returns NULL. However, all callers expect it to either return a valid pointer or an error pointer, at least since commit 5264a2f4bb3b ("tracing: Fix a NULL vs IS_ERR() bug in event_subsystem_dir()"). Returning NULL will therefore fail the error condition check in the caller. Fix this by substituting the NULL return value with a fitting error pointer. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: stable@vger.kernel.org Fixes: 77a06c33a22d ("eventfs: Test for ei->is_freed when accessing ei->dentry") Link: https://lore.kernel.org/20240723122522.2724-1-minipli@grsecurity.net Reviewed-by: Dan Carpenter Reviewed-by: Ajay Kaher Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 5d88c184f0fc..a9c28a1d5dc8 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -736,7 +736,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode /* Was the parent freed? */ if (list_empty(&ei->list)) { cleanup_ei(ei); - ei = NULL; + ei = ERR_PTR(-EBUSY); } return ei; } -- cgit From 8e556432477e97ad6179c61b61a32bf5f1af2355 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Tue, 23 Jul 2024 23:07:53 +0200 Subject: eventfs: Use SRCU for freeing eventfs_inodes To mirror the SRCU lock held in eventfs_iterate() when iterating over eventfs inodes, use call_srcu() to free them too. This was accidentally(?) degraded to RCU in commit 43aa6f97c2d0 ("eventfs: Get rid of dentry pointers without refcounts"). 
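Why the RCU flavour matters can be shown with a toy model (single-threaded userspace C, invented types; real SRCU is far more involved): an object whose readers hold a specific SRCU domain may only be freed once that same domain has quiesced, so a plain call_rcu() grace period proves nothing about eventfs_srcu readers.

	#include <assert.h>
	#include <stdio.h>

	struct srcu_domain { int readers; };

	static void srcu_read_lock(struct srcu_domain *d)   { d->readers++; }
	static void srcu_read_unlock(struct srcu_domain *d) { d->readers--; }

	/* stand-in for waiting out a grace period of *this* domain */
	static void wait_for_domain(struct srcu_domain *d)
	{
		assert(d->readers == 0);	/* only then is freeing safe */
	}

	int main(void)
	{
		struct srcu_domain eventfs_srcu = { 0 };
		struct srcu_domain other_domain = { 0 };	/* plain RCU, say */

		srcu_read_lock(&eventfs_srcu);		/* eventfs_iterate() running */
		wait_for_domain(&other_domain);		/* passes, but proves nothing here */
		srcu_read_unlock(&eventfs_srcu);
		wait_for_domain(&eventfs_srcu);		/* now the inode may be freed */
		printf("freed after the right grace period\n");
		return 0;
	}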
Cc: Ajay Kaher Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/20240723210755.8970-1-minipli@grsecurity.net Fixes: 43aa6f97c2d0 ("eventfs: Get rid of dentry pointers without refcounts") Signed-off-by: Mathias Krause Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/event_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index a9c28a1d5dc8..01e99e98457d 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -112,7 +112,7 @@ static void release_ei(struct kref *ref) entry->release(entry->name, ei->data); } - call_rcu(&ei->rcu, free_ei_rcu); + call_srcu(&eventfs_srcu, &ei->rcu, free_ei_rcu); } static inline void put_ei(struct eventfs_inode *ei) -- cgit From 0b6743bd60a56a701070b89fb80c327a44b7b3e2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 7 Aug 2024 18:54:02 -0400 Subject: tracefs: Use generic inode RCU for synchronizing freeing With structure layout randomization enabled for 'struct inode' we need to avoid overlapping any of the RCU-used / initialized-only-once members, e.g. i_lru or i_sb_list to not corrupt related list traversals when making use of the rcu_head. For an unlucky structure layout of 'struct inode' we may end up with the following splat when running the ftrace selftests: [<...>] list_del corruption, ffff888103ee2cb0->next (tracefs_inode_cache+0x0/0x4e0 [slab object]) is NULL (prev is tracefs_inode_cache+0x78/0x4e0 [slab object]) [<...>] ------------[ cut here ]------------ [<...>] kernel BUG at lib/list_debug.c:54! [<...>] invalid opcode: 0000 [#1] PREEMPT SMP KASAN [<...>] CPU: 3 PID: 2550 Comm: mount Tainted: G N 6.8.12-grsec+ #122 ed2f536ca62f28b087b90e3cc906a8d25b3ddc65 [<...>] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 [<...>] RIP: 0010:[] __list_del_entry_valid_or_report+0x138/0x3e0 [<...>] Code: 48 b8 99 fb 65 f2 ff ff ff ff e9 03 5c d9 fc cc 48 b8 99 fb 65 f2 ff ff ff ff e9 33 5a d9 fc cc 48 b8 99 fb 65 f2 ff ff ff ff <0f> 0b 4c 89 e9 48 89 ea 48 89 ee 48 c7 c7 60 8f dd 89 31 c0 e8 2f [<...>] RSP: 0018:fffffe80416afaf0 EFLAGS: 00010283 [<...>] RAX: 0000000000000098 RBX: ffff888103ee2cb0 RCX: 0000000000000000 [<...>] RDX: ffffffff84655fe8 RSI: ffffffff89dd8b60 RDI: 0000000000000001 [<...>] RBP: ffff888103ee2cb0 R08: 0000000000000001 R09: fffffbd0082d5f25 [<...>] R10: fffffe80416af92f R11: 0000000000000001 R12: fdf99c16731d9b6d [<...>] R13: 0000000000000000 R14: ffff88819ad4b8b8 R15: 0000000000000000 [<...>] RBX: tracefs_inode_cache+0x0/0x4e0 [slab object] [<...>] RDX: __list_del_entry_valid_or_report+0x108/0x3e0 [<...>] RSI: __func__.47+0x4340/0x4400 [<...>] RBP: tracefs_inode_cache+0x0/0x4e0 [slab object] [<...>] RSP: process kstack fffffe80416afaf0+0x7af0/0x8000 [mount 2550 2550] [<...>] R09: kasan shadow of process kstack fffffe80416af928+0x7928/0x8000 [mount 2550 2550] [<...>] R10: process kstack fffffe80416af92f+0x792f/0x8000 [mount 2550 2550] [<...>] R14: tracefs_inode_cache+0x78/0x4e0 [slab object] [<...>] FS: 00006dcb380c1840(0000) GS:ffff8881e0600000(0000) knlGS:0000000000000000 [<...>] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [<...>] CR2: 000076ab72b30e84 CR3: 000000000b088004 CR4: 0000000000360ef0 shadow CR4: 0000000000360ef0 [<...>] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [<...>] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [<...>] ASID: 0003 [<...>] Stack: [<...>] 
ffffffff818a2315 00000000f5c856ee ffffffff896f1840 ffff888103ee2cb0 [<...>] ffff88812b6b9750 0000000079d714b6 fffffbfff1e9280b ffffffff8f49405f [<...>] 0000000000000001 0000000000000000 ffff888104457280 ffffffff8248b392 [<...>] Call Trace: [<...>] [<...>] [] ? lock_release+0x175/0x380 fffffe80416afaf0 [<...>] [] list_lru_del+0x152/0x740 fffffe80416afb48 [<...>] [] list_lru_del_obj+0x113/0x280 fffffe80416afb88 [<...>] [] ? _atomic_dec_and_lock+0x119/0x200 fffffe80416afb90 [<...>] [] iput_final+0x1c4/0x9a0 fffffe80416afbb8 [<...>] [] dentry_unlink_inode+0x44b/0xaa0 fffffe80416afbf8 [<...>] [] __dentry_kill+0x23c/0xf00 fffffe80416afc40 [<...>] [] ? __this_cpu_preempt_check+0x1f/0xa0 fffffe80416afc48 [<...>] [] ? shrink_dentry_list+0x1c5/0x760 fffffe80416afc70 [<...>] [] ? shrink_dentry_list+0x51/0x760 fffffe80416afc78 [<...>] [] shrink_dentry_list+0x288/0x760 fffffe80416afc80 [<...>] [] shrink_dcache_sb+0x155/0x420 fffffe80416afcc8 [<...>] [] ? debug_smp_processor_id+0x23/0xa0 fffffe80416afce0 [<...>] [] ? do_one_tree+0x140/0x140 fffffe80416afcf8 [<...>] [] ? do_remount+0x329/0xa00 fffffe80416afd18 [<...>] [] ? security_sb_remount+0x81/0x1c0 fffffe80416afd38 [<...>] [] reconfigure_super+0x856/0x14e0 fffffe80416afd70 [<...>] [] ? ns_capable_common+0xe7/0x2a0 fffffe80416afd90 [<...>] [] do_remount+0x416/0xa00 fffffe80416afdd0 [<...>] [] path_mount+0x5c4/0x900 fffffe80416afe28 [<...>] [] ? finish_automount+0x13a0/0x13a0 fffffe80416afe60 [<...>] [] ? user_path_at_empty+0xb2/0x140 fffffe80416afe88 [<...>] [] do_mount+0x115/0x1c0 fffffe80416afeb8 [<...>] [] ? path_mount+0x900/0x900 fffffe80416afed8 [<...>] [] ? __kasan_check_write+0x1c/0xa0 fffffe80416afee0 [<...>] [] __do_sys_mount+0x12f/0x280 fffffe80416aff30 [<...>] [] __x64_sys_mount+0xcd/0x2e0 fffffe80416aff70 [<...>] [] ? 
syscall_trace_enter+0x218/0x380 fffffe80416aff88 [<...>] [] x64_sys_call+0x5d5e/0x6720 fffffe80416affa8 [<...>] [] do_syscall_64+0xcd/0x3c0 fffffe80416affb8 [<...>] [] entry_SYSCALL_64_safe_stack+0x4c/0x87 fffffe80416affe8 [<...>] [<...>] [<...>] RIP: 0033:[<00006dcb382ff66a>] vm_area_struct[mount 2550 2550 file 6dcb38225000-6dcb3837e000 22 55(read|exec|mayread|mayexec)]+0x0/0xb8 [userland map] [<...>] Code: 48 8b 0d 29 18 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d f6 17 0d 00 f7 d8 64 89 01 48 [<...>] RSP: 002b:0000763d68192558 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [<...>] RAX: ffffffffffffffda RBX: 00006dcb38433264 RCX: 00006dcb382ff66a [<...>] RDX: 000017c3e0d11210 RSI: 000017c3e0d1a5a0 RDI: 000017c3e0d1ae70 [<...>] RBP: 000017c3e0d10fb0 R08: 000017c3e0d11260 R09: 00006dcb383d1be0 [<...>] R10: 000000000020002e R11: 0000000000000246 R12: 0000000000000000 [<...>] R13: 000017c3e0d1ae70 R14: 000017c3e0d11210 R15: 000017c3e0d10fb0 [<...>] RBX: vm_area_struct[mount 2550 2550 file 6dcb38433000-6dcb38434000 5b 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RCX: vm_area_struct[mount 2550 2550 file 6dcb38225000-6dcb3837e000 22 55(read|exec|mayread|mayexec)]+0x0/0xb8 [userland map] [<...>] RDX: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RSI: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RDI: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RBP: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] RSP: vm_area_struct[mount 2550 2550 anon 763d68173000-763d68195000 7ffffffdd 100133(read|write|mayread|maywrite|growsdown|account)]+0x0/0xb8 [userland map] [<...>] R08: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R09: vm_area_struct[mount 2550 2550 file 6dcb383d1000-6dcb383d3000 1cd 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R13: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R14: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] R15: vm_area_struct[mount 2550 2550 anon 17c3e0d0f000-17c3e0d31000 17c3e0d0f 100033(read|write|mayread|maywrite|account)]+0x0/0xb8 [userland map] [<...>] [<...>] Modules linked in: [<...>] ---[ end trace 0000000000000000 ]--- The list debug message as well as RBX's symbolic value point out that the object in question was allocated from 'tracefs_inode_cache' and that the list's '->next' member is at offset 0. Dumping the layout of the relevant parts of 'struct tracefs_inode' gives the following: struct tracefs_inode { union { struct inode { struct list_head { struct list_head * next; /* 0 8 */ struct list_head * prev; /* 8 8 */ } i_lru; [...] } vfs_inode; struct callback_head { void (*func)(struct callback_head *); /* 0 8 */ struct callback_head * next; /* 8 8 */ } rcu; }; [...] 
}; The above shows that 'vfs_inode.i_lru' overlaps with 'rcu' which will destroy the 'i_lru' list as soon as the 'rcu' member gets used, e.g. in call_rcu() or later when calling the RCU callback. This will disturb concurrent list traversals as well as object reuse which assumes these list heads will keep their integrity. For reproduction, the following diff manually overlays 'i_lru' with 'rcu' as, otherwise, one would require some good portion of luck for gambling an unlucky RANDSTRUCT seed: --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -629,6 +629,7 @@ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; + struct list_head i_lru; /* inode LRU list */ kgid_t i_gid; unsigned int i_flags; @@ -690,7 +691,6 @@ struct inode { u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif - struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { The tracefs inode does not need to supply its own RCU delayed destruction of its inode. The inode code itself offers both a "destroy_inode()" callback that gets called when the last reference of the inode is released, and the "free_inode()" which is called after an RCU synchronization period from the "destroy_inode()". The tracefs code can unlink the inode from its list in the destroy_inode() callback, and then simply free it from the free_inode() callback. This should provide the same protection. Link: https://lore.kernel.org/all/20240807115143.45927-3-minipli@grsecurity.net/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Ajay Kaher Cc: Ilkka Naulapää Link: https://lore.kernel.org/20240807185402.61410544@gandalf.local.home Fixes: baa23a8d4360 ("tracefs: Reset permissions on remount if permissions are options") Reported-by: Mathias Krause Reported-by: Brad Spengler Suggested-by: Al Viro Signed-off-by: Steven Rostedt (Google) --- fs/tracefs/inode.c | 10 ++++------ fs/tracefs/internal.h | 5 +---- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21a7e51fc3c1..1748dff58c3b 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -53,15 +53,14 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb) return &ti->vfs_inode; } -static void tracefs_free_inode_rcu(struct rcu_head *rcu) +static void tracefs_free_inode(struct inode *inode) { - struct tracefs_inode *ti; + struct tracefs_inode *ti = get_tracefs(inode); - ti = container_of(rcu, struct tracefs_inode, rcu); kmem_cache_free(tracefs_inode_cachep, ti); } -static void tracefs_free_inode(struct inode *inode) +static void tracefs_destroy_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); unsigned long flags; @@ -69,8 +68,6 @@ static void tracefs_free_inode(struct inode *inode) spin_lock_irqsave(&tracefs_inode_lock, flags); list_del_rcu(&ti->list); spin_unlock_irqrestore(&tracefs_inode_lock, flags); - - call_rcu(&ti->rcu, tracefs_free_inode_rcu); } static ssize_t default_read_file(struct file *file, char __user *buf, @@ -437,6 +434,7 @@ static int tracefs_drop_inode(struct inode *inode) static const struct super_operations tracefs_super_operations = { .alloc_inode = tracefs_alloc_inode, .free_inode = tracefs_free_inode, + .destroy_inode = tracefs_destroy_inode, .drop_inode = tracefs_drop_inode, .statfs = simple_statfs, .show_options = tracefs_show_options, diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index f704d8348357..d83c2a25f288 ---
a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -10,10 +10,7 @@ enum { }; struct tracefs_inode { - union { - struct inode vfs_inode; - struct rcu_head rcu; - }; + struct inode vfs_inode; /* The below gets initialized with memset_after(ti, 0, vfs_inode) */ struct list_head list; unsigned long flags; -- cgit From 6d496e02b4a70926c3bd4e7ab6249ff262eb3bc0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 18:03:54 -0400 Subject: bcachefs: Add missing path_traverse() to btree_iter_next_node() This fixes a bug exposed by the next patch - we pop an assert in path_set_should_be_locked(). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 36872207f09b..aa8a049071f4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1921,6 +1921,11 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) bch2_trans_verify_not_in_restart(trans); bch2_btree_iter_verify(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + struct btree_path *path = btree_iter_path(trans, iter); /* already at end? */ -- cgit From cecf72798b25fcb00303392407fccf500a746747 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 13:58:57 -0400 Subject: bcachefs: Make allocator stuck timeout configurable, ratelimit messages Limit these messages to once every 2 minutes to avoid spamming logs; with multiple devices the output can be quite significant. Also, up the default timeout to 30 seconds from 10 seconds. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 26 ++++++++++++++++++++++++-- fs/bcachefs/alloc_foreground.h | 7 ++++++- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bcachefs_format.h | 2 ++ fs/bcachefs/io_misc.c | 6 +----- fs/bcachefs/io_write.c | 5 +---- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/super-io.c | 4 ++++ 8 files changed, 45 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 8683fe4fae5b..02de5ad2be2c 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1758,11 +1758,12 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); } -void bch2_print_allocator_stuck(struct bch_fs *c) +static noinline void bch2_print_allocator_stuck(struct bch_fs *c) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "Allocator stuck? Waited for 10 seconds\n"); + prt_printf(&buf, "Allocator stuck?
Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); prt_printf(&buf, "Allocator debug:\n"); printbuf_indent_add(&buf, 2); @@ -1792,3 +1793,24 @@ void bch2_print_allocator_stuck(struct bch_fs *c) bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } + +static inline unsigned allocator_wait_timeout(struct bch_fs *c) +{ + if (c->allocator_last_stuck && + time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies)) + return 0; + + return c->opts.allocator_stuck_timeout * HZ; +} + +void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + unsigned t = allocator_wait_timeout(c); + + if (t && closure_sync_timeout(cl, t)) { + c->allocator_last_stuck = jiffies; + bch2_print_allocator_stuck(c); + } + + closure_sync(cl); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index c78a64ec0553..386d231ceca3 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -231,6 +231,11 @@ void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_alloc_debug_to_text(struct printbuf *, struct bch_fs *); void bch2_dev_alloc_debug_to_text(struct printbuf *, struct bch_dev *); -void bch2_print_allocator_stuck(struct bch_fs *); +void __bch2_wait_on_allocator(struct bch_fs *, struct closure *); +static inline void bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl) +{ + if (cl->closure_get_happened) + __bch2_wait_on_allocator(c, cl); +} #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 91361a167dcd..eedf2d6045e7 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -893,6 +893,8 @@ struct bch_fs { struct bch_fs_usage_base __percpu *usage; u64 __percpu *online_reserved; + unsigned long allocator_last_stuck; + struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 74a60b1a4ddf..ad893684db52 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -836,6 +836,8 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); +LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 2cf6297756f8..177ed331c00b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -126,11 +126,7 @@ err_noprint: if (closure_nr_remaining(&cl) != 1) { bch2_trans_unlock_long(trans); - - if (closure_sync_timeout(&cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&cl); - } + bch2_wait_on_allocator(c, &cl); } return ret; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index d31c8d006d97..1d4761d15002 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -1503,10 +1503,7 @@ err: if ((op->flags & BCH_WRITE_SYNC) || (!(op->flags & BCH_WRITE_SUBMITTED) && !(op->flags & BCH_WRITE_IN_WORKER))) { - if (closure_sync_timeout(&op->cl, HZ * 10)) { - bch2_print_allocator_stuck(c); - closure_sync(&op->cl); - } + bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 60b93018501f..cda1725702ea 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -391,6 +391,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ NULL, "Log transaction function names in journal") \ + 
x(allocator_stuck_timeout, u16, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(0, U16_MAX), \ BCH_SB_ALLOCATOR_STUCK_TIMEOUT, 30, \ NULL, "Default timeout in seconds for stuck allocator messages")\ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8bc819832790..c8c2ccbdfbb5 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -414,6 +414,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2 && + !BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb)) + SET_BCH_SB_ALLOCATOR_STUCK_TIMEOUT(sb, 30); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { -- cgit From 1b5487aefb1ce7a6b1f15a33297d1231306b4122 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 31 Jul 2024 21:38:50 -0500 Subject: smb3: fix setting SecurityFlags when encryption is required Setting encryption as required in security flags was broken. For example, requiring all mounts to be encrypted with "echo 0x400c5 > /proc/fs/cifs/SecurityFlags" would return "Invalid argument" and log "Unsupported security flags". This patch fixes that (e.g. allowing overriding the default for SecurityFlags 0x00c5, including 0x40000 to require seal, i.e. SMB3.1.1 encryption) so now that works and forces encryption on subsequent mounts. Acked-by: Bharath SM Cc: stable@vger.kernel.org Signed-off-by: Steve French --- Documentation/admin-guide/cifs/usage.rst | 2 +- fs/smb/client/cifs_debug.c | 2 +- fs/smb/client/cifsglob.h | 8 ++++---- fs/smb/client/smb2pdu.c | 3 +++ 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index fd4b56c0996f..c09674a75a9e 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -742,7 +742,7 @@ SecurityFlags Flags which control security negotiation and may use NTLMSSP 0x00080 must use NTLMSSP 0x80080 seal (packet encryption) 0x00040 - must seal (not implemented yet) 0x40040 + must seal 0x40040 cifsFYI If set to non-zero value, additional debug information will be logged to the system error log.
This field diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c71ae5c04306..4a20e92474b2 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -1072,7 +1072,7 @@ static int cifs_security_flags_proc_open(struct inode *inode, struct file *file) static void cifs_security_flags_handle_must_flags(unsigned int *flags) { - unsigned int signflags = *flags & CIFSSEC_MUST_SIGN; + unsigned int signflags = *flags & (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL); if ((*flags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) *flags = CIFSSEC_MUST_KRB5; diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f6d1f075987f..b9f46d29a441 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1881,7 +1881,7 @@ static inline bool is_replayable_error(int error) #define CIFSSEC_MAY_SIGN 0x00001 #define CIFSSEC_MAY_NTLMV2 0x00004 #define CIFSSEC_MAY_KRB5 0x00008 -#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ +#define CIFSSEC_MAY_SEAL 0x00040 #define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_MUST_SIGN 0x01001 @@ -1891,11 +1891,11 @@ require use of the stronger protocol */ #define CIFSSEC_MUST_NTLMV2 0x04004 #define CIFSSEC_MUST_KRB5 0x08008 #ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xCF0CF /* flags supported if no weak allowed */ #else -#define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ +#define CIFSSEC_MASK 0xC70C7 /* flags supported if no weak allowed */ #endif /* UPCALL */ -#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ +#define CIFSSEC_MUST_SEAL 0x40040 #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP | CIFSSEC_MAY_SEAL) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9a06b5594669..83facb54276a 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -82,6 +82,9 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) if (tcon->seal && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) return 1; + if (((global_secflags & CIFSSEC_MUST_SEAL) == CIFSSEC_MUST_SEAL) && + (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + return 1; return 0; } -- cgit From bdcffe4be7cb90ccd12c49924dad9e2eda11b57a Mon Sep 17 00:00:00 2001 From: Xiaxi Shen Date: Wed, 7 Aug 2024 09:53:20 -0700 Subject: Fix spelling errors in Server Message Block Fixed typos in various files under fs/smb/client/ Signed-off-by: Xiaxi Shen Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 4 ++-- fs/smb/client/misc.c | 2 +- fs/smb/client/smbdirect.c | 8 ++++---- fs/smb/client/transport.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b9f46d29a441..5c9b3e6cd95f 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -345,7 +345,7 @@ struct smb_version_operations { /* connect to a server share */ int (*tree_connect)(const unsigned int, struct cifs_ses *, const char *, struct cifs_tcon *, const struct nls_table *); - /* close tree connecion */ + /* close tree connection */ int (*tree_disconnect)(const unsigned int, struct cifs_tcon *); /* get DFS referrals */ int (*get_dfs_refer)(const unsigned int, struct cifs_ses *, @@ -816,7 +816,7 @@ struct TCP_Server_Info { * Protected by @refpath_lock and @srv_lock. 
The @refpath_lock is * mostly used for not requiring a copy of @leaf_fullpath when getting * cached or new DFS referrals (which might also sleep during I/O). - * While @srv_lock is held for making string and NULL comparions against + * While @srv_lock is held for making string and NULL comparisons against * both fields as in mount(2) and cache refresh. * * format: \\HOST\SHARE[\OPTIONAL PATH] diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b28ff62f1f15..3fe5bfc389d0 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -352,7 +352,7 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) * on simple responses (wct, bcc both zero) * in particular have seen this on * ulogoffX and FindClose. This leaves - * one byte of bcc potentially unitialized + * one byte of bcc potentially uninitialized */ /* zero rest of bcc */ tmp[sizeof(struct smb_hdr)+1] = 0; diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index d74e829de51c..7bcc379014ca 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -406,7 +406,7 @@ static void smbd_post_send_credits(struct work_struct *work) else response = get_empty_queue_buffer(info); if (!response) { - /* now switch to emtpy packet queue */ + /* now switch to empty packet queue */ if (use_receive_queue) { use_receive_queue = 0; continue; @@ -618,7 +618,7 @@ out: /* * Test if FRWR (Fast Registration Work Requests) is supported on the device - * This implementation requries FRWR on RDMA read/write + * This implementation requires FRWR on RDMA read/write * return value: true if it is supported */ static bool frwr_is_supported(struct ib_device_attr *attrs) @@ -2177,7 +2177,7 @@ cleanup_entries: * MR available in the list. It may access the list while the * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock * as they never modify the same places. However, there may be several CPUs - * issueing I/O trying to get MR at the same time, mr_list_lock is used to + * issuing I/O trying to get MR at the same time, mr_list_lock is used to * protect this situation. */ static struct smbd_mr *get_mr(struct smbd_connection *info) @@ -2311,7 +2311,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info, /* * There is no need for waiting for complemtion on ib_post_send * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution - * on the next ib_post_send when we actaully send I/O to remote peer + * on the next ib_post_send when we actually send I/O to remote peer */ rc = ib_post_send(info->id->qp, ®_wr->wr, NULL); if (!rc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index adfe0d058701..6e68aaf5bd20 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -1289,7 +1289,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, out: /* * This will dequeue all mids. After this it is important that the - * demultiplex_thread will not process any of these mids any futher. + * demultiplex_thread will not process any of these mids any further. * This is prevented above by using a noop callback that will not * wake this thread except for the very last PDU. */ -- cgit From f39bae2e028b841732ca81d8131d27b48a6051ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 15:42:23 -0400 Subject: bcachefs: Switch to .get_inode_acl() .set_acl() requires a dentry, and if one isn't passed it marks the VFS inode as not having an ACL. 
This has been causing inodes with ACLs to have them "disappear" on bcachefs filesystem, depending on which path those inodes get pulled into the cache from. Switching to .get_inode_acl(), like other local filesystems, fixes this. Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 11 +++++++---- fs/bcachefs/acl.h | 2 +- fs/bcachefs/fs.c | 8 ++++---- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index a7b425d3c8a0..331a17f3f113 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -272,16 +272,19 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, - struct dentry *dentry, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { - struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; struct posix_acl *acl = NULL; + + if (rcu) + return ERR_PTR(-ECHILD); + + struct btree_trans *trans = bch2_trans_get(c); retry: bch2_trans_begin(trans); diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 27e7eec0f278..fe730a6bf0c1 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,7 +28,7 @@ void bch2_acl_to_text(struct printbuf *, const void *, size_t); #ifdef CONFIG_BCACHEFS_POSIX_ACL -struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3a5f49affa0a..15fc41e63b6c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1199,7 +1199,7 @@ static const struct inode_operations bch_file_inode_operations = { .fiemap = bch2_fiemap, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1219,7 +1219,7 @@ static const struct inode_operations bch_dir_inode_operations = { .tmpfile = bch2_tmpfile, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1241,7 +1241,7 @@ static const struct inode_operations bch_symlink_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; @@ -1251,7 +1251,7 @@ static const struct inode_operations bch_special_inode_operations = { .setattr = bch2_setattr, .listxattr = bch2_xattr_list, #ifdef CONFIG_BCACHEFS_POSIX_ACL - .get_acl = bch2_get_acl, + .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif }; -- cgit From 36bb22a08a69d9984a8399c07310d18b115eae20 Mon Sep 17 00:00:00 2001 From: Gleb Korobeynikov Date: Thu, 8 Aug 2024 18:47:48 +0300 Subject: cifs: cifs_inval_name_dfs_link_error: correct the check for fullpath Replace the always-true check tcon->origin_fullpath with check of server->leaf_fullpath See https://bugzilla.kernel.org/show_bug.cgi?id=219083 The check of the new @tcon will always be true during mounting, since @tcon->origin_fullpath will only be set after the tree is connected 
to the latest common resource, and after checking that the prefix paths from it are fully accessible. Fixes: 3ae872de4107 ("smb: client: fix shared DFS root mounts with different prefixes") Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Gleb Korobeynikov Signed-off-by: Steve French --- fs/smb/client/misc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 3fe5bfc389d0..c6f11e6f9eb9 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -1234,6 +1234,7 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, const char *full_path, bool *islink) { + struct TCP_Server_Info *server = tcon->ses->server; struct cifs_ses *ses = tcon->ses; size_t len; char *path; @@ -1250,12 +1251,12 @@ int cifs_inval_name_dfs_link_error(const unsigned int xid, !is_tcon_dfs(tcon)) return 0; - spin_lock(&tcon->tc_lock); - if (!tcon->origin_fullpath) { - spin_unlock(&tcon->tc_lock); + spin_lock(&server->srv_lock); + if (!server->leaf_fullpath) { + spin_unlock(&server->srv_lock); return 0; } - spin_unlock(&tcon->tc_lock); + spin_unlock(&server->srv_lock); /* * Slow path - tcon is DFS and @full_path has prefix path, so attempt -- cgit From a018c1b636e79b60149b41151ded7c2606d8606e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:56:18 +0900 Subject: ksmbd: override fsids for share path check Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch overrides fsids for the share path check. Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/mgmt/share_config.c | 15 ++++++++++++--- fs/smb/server/mgmt/share_config.h | 4 +++- fs/smb/server/mgmt/tree_connect.c | 9 +++++---- fs/smb/server/mgmt/tree_connect.h | 4 ++-- fs/smb/server/smb2pdu.c | 2 +- fs/smb/server/smb_common.c | 9 +++++++-- fs/smb/server/smb_common.h | 2 ++ 7 files changed, 32 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index e0a6b758094f..d8d03070ae44 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -15,6 +15,7 @@ #include "share_config.h" #include "user_config.h" #include "user_session.h" +#include "../connection.h" #include "../transport_ipc.h" #include "../misc.h" @@ -120,12 +121,13 @@ static int parse_veto_list(struct ksmbd_share_config *share, return 0; } -static struct ksmbd_share_config *share_config_request(struct unicode_map *um, +static struct ksmbd_share_config *share_config_request(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config_response *resp; struct ksmbd_share_config *share = NULL; struct ksmbd_share_config *lookup; + struct unicode_map *um = work->conn->um; int ret; resp = ksmbd_ipc_share_config_request(name); @@ -181,7 +183,14 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um, KSMBD_SHARE_CONFIG_VETO_LIST(resp), resp->veto_list_sz); if (!ret && share->path) { + if (__ksmbd_override_fsids(work, share)) { + kill_share(share); + share = NULL; + goto out; + } + ret = kern_path(share->path, 0, &share->vfs_path); + ksmbd_revert_fsids(work); if (ret) { ksmbd_debug(SMB, "failed to access '%s'\n", share->path); @@ -214,7 +223,7 @@ out: return share; } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name) { struct ksmbd_share_config *share;
@@ -227,7 +236,7 @@ struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, if (share) return share; - return share_config_request(um, name); + return share_config_request(work, name); } bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h index 5f591751b923..d4ac2dd4de20 100644 --- a/fs/smb/server/mgmt/share_config.h +++ b/fs/smb/server/mgmt/share_config.h @@ -11,6 +11,8 @@ #include #include +struct ksmbd_work; + struct ksmbd_share_config { char *name; char *path; @@ -68,7 +70,7 @@ static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) __ksmbd_share_config_put(share); } -struct ksmbd_share_config *ksmbd_share_config_get(struct unicode_map *um, +struct ksmbd_share_config *ksmbd_share_config_get(struct ksmbd_work *work, const char *name); bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, const char *filename); diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index d2c81a8a11dd..94a52a75014a 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -16,17 +16,18 @@ #include "user_session.h" struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name) +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name) { struct ksmbd_tree_conn_status status = {-ENOENT, NULL}; struct ksmbd_tree_connect_response *resp = NULL; struct ksmbd_share_config *sc; struct ksmbd_tree_connect *tree_conn = NULL; struct sockaddr *peer_addr; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; int ret; - sc = ksmbd_share_config_get(conn->um, share_name); + sc = ksmbd_share_config_get(work, share_name); if (!sc) return status; @@ -61,7 +62,7 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, struct ksmbd_share_config *new_sc; ksmbd_share_config_del(sc); - new_sc = ksmbd_share_config_get(conn->um, share_name); + new_sc = ksmbd_share_config_get(work, share_name); if (!new_sc) { pr_err("Failed to update stale share config\n"); status.ret = -ESTALE; diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 6377a70b811c..a42cdd051041 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -13,6 +13,7 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; +struct ksmbd_work; enum { TREE_NEW = 0, @@ -50,8 +51,7 @@ static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn, struct ksmbd_session; struct ksmbd_tree_conn_status -ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, - const char *share_name); +ksmbd_tree_conn_connect(struct ksmbd_work *work, const char *share_name); void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 37a39ab4ee65..54154d36ea2f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1955,7 +1955,7 @@ int smb2_tree_connect(struct ksmbd_work *work) ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n", name, treename); - status = ksmbd_tree_conn_connect(conn, sess, name); + status = ksmbd_tree_conn_connect(work, name); if (status.ret == KSMBD_TREE_CONN_STATUS_OK) rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id); else diff --git a/fs/smb/server/smb_common.c 
b/fs/smb/server/smb_common.c index 474dadf6b7b8..13818ecb6e1b 100644 --- a/fs/smb/server/smb_common.c +++ b/fs/smb/server/smb_common.c @@ -732,10 +732,10 @@ bool is_asterisk(char *p) return p && p[0] == '*'; } -int ksmbd_override_fsids(struct ksmbd_work *work) +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share) { struct ksmbd_session *sess = work->sess; - struct ksmbd_share_config *share = work->tcon->share_conf; struct cred *cred; struct group_info *gi; unsigned int uid; @@ -775,6 +775,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work) return 0; } +int ksmbd_override_fsids(struct ksmbd_work *work) +{ + return __ksmbd_override_fsids(work, work->tcon->share_conf); +} + void ksmbd_revert_fsids(struct ksmbd_work *work) { const struct cred *cred; diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index f1092519c0c2..4a3148b0167f 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -447,6 +447,8 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command); int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp); +int __ksmbd_override_fsids(struct ksmbd_work *work, + struct ksmbd_share_config *share); int ksmbd_override_fsids(struct ksmbd_work *work); void ksmbd_revert_fsids(struct ksmbd_work *work); -- cgit From f6bd41280a44dcc2e0a25ed72617d25f586974a7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 5 Aug 2024 08:57:03 +0900 Subject: ksmbd: override fsids for smb2_query_info() Sangsoo reported that a DAC denial error occurred when accessing files through the ksmbd thread. This patch overrides fsids for smb2_query_info(). Reported-by: Sangsoo Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 54154d36ea2f..2df1354288e6 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5596,6 +5596,11 @@ int smb2_query_info(struct ksmbd_work *work) ksmbd_debug(SMB, "GOT query info request\n"); + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out; + } + switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); @@ -5614,6 +5619,7 @@ int smb2_query_info(struct ksmbd_work *work) req->InfoType); rc = -EOPNOTSUPP; } + ksmbd_revert_fsids(work); if (!rc) { rsp->StructureSize = cpu_to_le16(9); @@ -5623,6 +5629,7 @@ int smb2_query_info(struct ksmbd_work *work) le32_to_cpu(rsp->OutputBufferLength)); } +err_out: if (rc < 0) { if (rc == -EACCES) rsp->hdr.Status = STATUS_ACCESS_DENIED; -- cgit From 077e47372309dcbe3a150754ea9c6f15cc838d6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 23:19:59 -0400 Subject: bcachefs: bch2_accounting_invalid() Implement bch2_accounting_invalid(); check for junk at the end, and replicas accounting entries in particular need to be checked or we'll pop asserts later.
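As an illustration (not part of the patch): the junk-at-end check boils down to verifying that every byte between the last valid field for the key's type and the end of the key struct is zero. A minimal standalone sketch of that idea in C, using a hypothetical stand-in type rather than the real bcachefs structs:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for a decoded accounting key. */
struct acct_key {
	unsigned char type;
	unsigned int id;        /* last valid field for this type */
	unsigned char pad[16];  /* trailing bytes that must stay zero */
};

/* Return true if every byte in [start, end) is zero. */
static bool bytes_are_zero(const unsigned char *start, const unsigned char *end)
{
	for (; start < end; start++)
		if (*start)
			return false;
	return true;
}

/* The key is well-formed only if nothing nonzero follows its last field. */
static bool acct_key_tail_is_clean(const struct acct_key *k)
{
	const unsigned char *field_end = (const unsigned char *)&k->id + sizeof(k->id);
	const unsigned char *key_end = (const unsigned char *)(k + 1);

	return bytes_are_zero(field_end, key_end);
}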
Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 65 +++++++++++++++++++++++++++++++++++- fs/bcachefs/disk_accounting_format.h | 9 +++-- fs/bcachefs/replicas.c | 1 - fs/bcachefs/sb-errors_format.h | 6 +++- 4 files changed, 73 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index dcdd59249c23..046ac92b6639 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -114,11 +114,74 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, return bch2_disk_accounting_mod(trans, &acc, §ors, 1, gc); } +static inline bool is_zero(char *start, char *end) +{ + BUG_ON(start > end); + + for (; start < end; start++) + if (*start) + return false; + return true; +} + +#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) + int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags, struct printbuf *err) { - return 0; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + void *end = &acc_k + 1; + int ret = 0; + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_nr_inodes: + end = field_end(acc_k, nr_inodes); + break; + case BCH_DISK_ACCOUNTING_persistent_reserved: + end = field_end(acc_k, persistent_reserved); + break; + case BCH_DISK_ACCOUNTING_replicas: + bkey_fsck_err_on(!acc_k.replicas.nr_devs, + c, err, accounting_key_replicas_nr_devs_0, + "accounting key replicas entry with nr_devs=0"); + + bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || + (acc_k.replicas.nr_required > 1 && + acc_k.replicas.nr_required == acc_k.replicas.nr_devs), + c, err, accounting_key_replicas_nr_required_bad, + "accounting key replicas entry with bad nr_required"); + + for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) + bkey_fsck_err_on(acc_k.replicas.devs[i] > acc_k.replicas.devs[i + 1], + c, err, accounting_key_replicas_devs_unsorted, + "accounting key replicas entry with unsorted devs"); + + end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + end = field_end(acc_k, dev_data_type); + break; + case BCH_DISK_ACCOUNTING_compression: + end = field_end(acc_k, compression); + break; + case BCH_DISK_ACCOUNTING_snapshot: + end = field_end(acc_k, snapshot); + break; + case BCH_DISK_ACCOUNTING_btree: + end = field_end(acc_k, btree); + break; + case BCH_DISK_ACCOUNTING_rebalance_work: + end = field_end(acc_k, rebalance_work); + break; + } + + bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), + c, err, accounting_key_junk_at_end, + "junk at end of accounting key"); +fsck_err: + return ret; } void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k) diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index cba417060b33..848f06cc809d 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -124,10 +124,6 @@ struct bch_dev_data_type { __u8 data_type; }; -struct bch_dev_stripe_buckets { - __u8 dev; -}; - struct bch_acct_compression { __u8 type; }; @@ -140,6 +136,9 @@ struct bch_acct_btree { __u32 id; }; +struct bch_acct_rebalance_work { +}; + struct disk_accounting_pos { union { struct { @@ -149,10 +148,10 @@ struct disk_accounting_pos { struct bch_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; struct bch_dev_data_type dev_data_type; - struct bch_dev_stripe_buckets dev_stripe_buckets; struct 
bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; + struct bch_acct_rebalance_work rebalance_work; }; }; struct bpos _pad; diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 10c96cb2047a..1223b710755d 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -24,7 +24,6 @@ static int bch2_memcmp(const void *l, const void *r, const void *priv) static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG - BUG_ON(e->data_type >= BCH_DATA_NR); BUG_ON(!e->nr_devs); BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d1b2f2aa397a..d3a498617303 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -287,7 +287,11 @@ enum bch_fsck_flags { x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ - x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ + x(accounting_key_junk_at_end, 277, 0) \ + x(accounting_key_replicas_nr_devs_0, 278, 0) \ + x(accounting_key_replicas_nr_required_bad, 279, 0) \ + x(accounting_key_replicas_devs_unsorted, 280, 0) \ enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit From 1a9e219db15e62760cfcc107ab6df3796d353605 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 8 Aug 2024 23:44:00 -0400 Subject: bcachefs: improve bch2_dev_usage_to_text() Add a line for capacity Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/buckets.c | 12 ++++++++---- fs/bcachefs/buckets.h | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 02de5ad2be2c..8563c2d26847 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1740,7 +1740,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); - bch2_dev_usage_to_text(out, &stats); + bch2_dev_usage_to_text(out, ca, &stats); prt_newline(out); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 2650a0d24663..9f7004e941ce 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -71,17 +71,21 @@ bch2_fs_usage_read_short(struct bch_fs *c) return ret; } -void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage) +void bch2_dev_usage_to_text(struct printbuf *out, + struct bch_dev *ca, + struct bch_dev_usage *usage) { prt_printf(out, "\tbuckets\rsectors\rfragmented\r\n"); for (unsigned i = 0; i < BCH_DATA_NR; i++) { bch2_prt_data_type(out, i); prt_printf(out, "\t%llu\r%llu\r%llu\r\n", - usage->d[i].buckets, - usage->d[i].sectors, - usage->d[i].fragmented); + usage->d[i].buckets, + usage->d[i].sectors, + usage->d[i].fragmented); } + + prt_printf(out, "capacity\t%llu\r\n", ca->mi.nbuckets); } static int bch2_check_fix_ptr(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 2d35eeb24a2d..edbdffd508fc 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -212,7 +212,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *); +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); static inline u64 
bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { -- cgit From 8a2491db7bea6ad88ec568731eafd583501f1c96 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 9 Aug 2024 00:25:25 -0400 Subject: bcachefs: bcachefs_metadata_version_disk_accounting_v3 bcachefs_metadata_version_disk_accounting_v2 erroneously had padding bytes in disk_accounting_key, which is a problem because we have to guarantee that all unused bytes in disk_accounting_key are zeroed. Fortunately 6.11 isn't out yet, so it's cheap to fix this by spinning a new version. Reported-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/disk_accounting_format.h | 8 ++++---- fs/bcachefs/sb-downgrade.c | 27 ++++++++++++++++++++++++++- 3 files changed, 32 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index ad893684db52..b25f86356728 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -675,7 +675,8 @@ struct bch_sb_field_ext { x(btree_subvolume_children, BCH_VERSION(1, 6)) \ x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ - x(disk_accounting_v2, BCH_VERSION(1, 9)) + x(disk_accounting_v2, BCH_VERSION(1, 9)) \ + x(disk_accounting_v3, BCH_VERSION(1, 10)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 848f06cc809d..a93cf26ff4a9 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -130,11 +130,11 @@ struct bch_acct_compression { struct bch_acct_snapshot { __u32 id; -}; +} __packed; struct bch_acct_btree { __u32 id; -}; +} __packed; struct bch_acct_rebalance_work { }; @@ -152,8 +152,8 @@ struct disk_accounting_pos { struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; - }; - }; + } __packed; + } __packed; struct bpos _pad; }; }; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index dfbbd33c8731..6c4469f53313 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -61,12 +61,37 @@ BCH_FSCK_ERR_dev_usage_buckets_wrong, \ BCH_FSCK_ERR_dev_usage_sectors_wrong, \ BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch) \ + x(disk_accounting_v3, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_bkey_version_in_future, \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ + BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ 0) \ x(disk_accounting_v2, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_dev_usage_buckets_wrong, \ + BCH_FSCK_ERR_dev_usage_sectors_wrong, \ + BCH_FSCK_ERR_dev_usage_fragmented_wrong, \ + BCH_FSCK_ERR_fs_usage_hidden_wrong, \ + BCH_FSCK_ERR_fs_usage_btree_wrong, \ + BCH_FSCK_ERR_fs_usage_data_wrong, \ + BCH_FSCK_ERR_fs_usage_cached_wrong, \ + BCH_FSCK_ERR_fs_usage_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ + BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ + BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + 
BCH_FSCK_ERR_bkey_version_in_future) \ + x(disk_accounting_v3, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_dev_usage_buckets_wrong, \ BCH_FSCK_ERR_dev_usage_sectors_wrong, \ -- cgit From 3eb3cd5992f7a0c37edc8d05b4c38c98758d8671 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Aug 2024 12:51:23 -0700 Subject: binfmt_flat: Fix corruption when not offsetting data start Commit 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") introduced a RISC-V specific variant of the FLAT format which does not allocate any space for the (obsolete) array of shared library pointers. However, it did not disable the code which initializes the array, resulting in the corruption of sizeof(long) bytes before the DATA segment, generally the end of the TEXT segment. Introduce MAX_SHARED_LIBS_UPDATE which depends on the state of CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET to guard the initialization of the shared library pointer region so that it will only be initialized if space is reserved for it. Fixes: 04d82a6d0881 ("binfmt_flat: allow not offsetting data start") Co-developed-by: Stefan O'Rear Signed-off-by: Stefan O'Rear Reviewed-by: Damien Le Moal Acked-by: Greg Ungerer Link: https://lore.kernel.org/r/20240807195119.it.782-kees@kernel.org Signed-off-by: Kees Cook --- fs/binfmt_flat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index c26545d71d39..cd6d5bbb4b9d 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -72,8 +72,10 @@ #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) +#define MAX_SHARED_LIBS_UPDATE (0) #else #define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#define MAX_SHARED_LIBS_UPDATE (MAX_SHARED_LIBS) #endif struct lib_info { @@ -880,7 +882,7 @@ static int load_flat_binary(struct linux_binprm *bprm) return res; /* Update data segment pointers for all libraries */ - for (i = 0; i < MAX_SHARED_LIBS; i++) { + for (i = 0; i < MAX_SHARED_LIBS_UPDATE; i++) { if (!libinfo.lib_list[i].loaded) continue; for (j = 0; j < MAX_SHARED_LIBS; j++) { -- cgit From 98055bc3595500bcf2126b93b1595354bdb86a66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 27 May 2024 21:17:32 +0100 Subject: netfs: Fault in smaller chunks for non-large folio mappings As in commit 4e527d5841e2 ("iomap: fault in smaller chunks for non-large folio mappings"), we can see a performance loss for filesystems which have not yet been converted to large folios. Fixes: c38f4e96e605 ("netfs: Provide func to copy data to pagecache for buffered write") Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240527201735.1898381-1-willy@infradead.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/netfs/buffered_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index 4726c315453c..ca53c5d1622e 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -184,7 +184,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? 
BDP_ASYNC : 0; ssize_t written = 0, ret, ret2; loff_t i_size, pos = iocb->ki_pos, from, to; - size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || -- cgit From 3f65f3c099bcb27949e712f39ba836f21785924a Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 29 Jul 2024 15:48:12 -0700 Subject: filelock: fix name of file_lease slab cache When struct file_lease was split out from struct file_lock, the name of the file_lock slab cache was copied to the new slab cache for file_lease. This name conflict causes confusion in /proc/slabinfo and /sys/kernel/slab. In particular, it caused failures in drgn's test case for slab cache merging. Link: https://github.com/osandov/drgn/blob/9ad29fd86499eb32847473e928b6540872d3d59a/tests/linux_kernel/helpers/test_slab.py#L81 Fixes: c69ff4071935 ("filelock: split leases out of struct file_lock") Signed-off-by: Omar Sandoval Link: https://lore.kernel.org/r/2d1d053da1cafb3e7940c4f25952da4f0af34e38.1722293276.git.osandov@fb.com Reviewed-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 9afb16e0683f..e45cad40f8b6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2984,7 +2984,7 @@ static int __init filelock_init(void) filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); - filelease_cache = kmem_cache_create("file_lock_cache", + filelease_cache = kmem_cache_create("file_lease_cache", sizeof(struct file_lease), 0, SLAB_PANIC, NULL); for_each_possible_cpu(i) { -- cgit From f71aa06398aabc2e3eaac25acdf3d62e0094ba70 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 29 Jul 2024 17:19:30 +0100 Subject: fs/netfs/fscache_cookie: add missing "n_accesses" check This fixes a NULL pointer dereference bug due to a data race which looks like this: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 33 PID: 16573 Comm: kworker/u97:799 Not tainted 6.8.7-cm4all1-hp+ #43 Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 10/17/2018 Workqueue: events_unbound netfs_rreq_write_to_cache_work RIP: 0010:cachefiles_prepare_write+0x30/0xa0 Code: 57 41 56 45 89 ce 41 55 49 89 cd 41 54 49 89 d4 55 53 48 89 fb 48 83 ec 08 48 8b 47 08 48 83 7f 10 00 48 89 34 24 48 8b 68 20 <48> 8b 45 08 4c 8b 38 74 45 49 8b 7f 50 e8 4e a9 b0 ff 48 8b 73 10 RSP: 0018:ffffb4e78113bde0 EFLAGS: 00010286 RAX: ffff976126be6d10 RBX: ffff97615cdb8438 RCX: 0000000000020000 RDX: ffff97605e6c4c68 RSI: ffff97605e6c4c60 RDI: ffff97615cdb8438 RBP: 0000000000000000 R08: 0000000000278333 R09: 0000000000000001 R10: ffff97605e6c4600 R11: 0000000000000001 R12: ffff97605e6c4c68 R13: 0000000000020000 R14: 0000000000000001 R15: ffff976064fe2c00 FS: 0000000000000000(0000) GS:ffff9776dfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000008 CR3: 000000005942c002 CR4: 00000000001706f0 Call Trace: ? __die+0x1f/0x70 ? page_fault_oops+0x15d/0x440 ? search_module_extables+0xe/0x40 ? fixup_exception+0x22/0x2f0 ? exc_page_fault+0x5f/0x100 ? asm_exc_page_fault+0x22/0x30 ? 
cachefiles_prepare_write+0x30/0xa0 netfs_rreq_write_to_cache_work+0x135/0x2e0 process_one_work+0x137/0x2c0 worker_thread+0x2e9/0x400 ? __pfx_worker_thread+0x10/0x10 kthread+0xcc/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Modules linked in: CR2: 0000000000000008 ---[ end trace 0000000000000000 ]--- This happened because fscache_cookie_state_machine() was slow and was still running while another process invoked fscache_unuse_cookie(); this led to a fscache_cookie_lru_do_one() call, setting the FSCACHE_COOKIE_DO_LRU_DISCARD flag, which was picked up by fscache_cookie_state_machine(), withdrawing the cookie via cachefiles_withdraw_cookie(), clearing cookie->cache_priv. At the same time, yet another process invoked cachefiles_prepare_write(), which found a NULL pointer in this code line: struct cachefiles_object *object = cachefiles_cres_object(cres); The next line crashes, obviously: struct cachefiles_cache *cache = object->volume->cache; During cachefiles_prepare_write(), the "n_accesses" counter is non-zero (via fscache_begin_operation()). The cookie must not be withdrawn until it drops to zero. The counter is checked by fscache_cookie_state_machine() before switching to FSCACHE_COOKIE_STATE_RELINQUISHING and FSCACHE_COOKIE_STATE_WITHDRAWING (in "case FSCACHE_COOKIE_STATE_FAILED"), but not for FSCACHE_COOKIE_STATE_LRU_DISCARDING ("case FSCACHE_COOKIE_STATE_ACTIVE"). This patch adds the missing check. With a non-zero access counter, the function returns and the next fscache_end_cookie_access() call will queue another fscache_cookie_state_machine() call to handle the still-pending FSCACHE_COOKIE_DO_LRU_DISCARD. Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") Signed-off-by: Max Kellermann Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240729162002.3436763-2-dhowells@redhat.com cc: Jeff Layton cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: stable@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/fscache_cookie.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..d4d4b3a8b106 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -741,6 +741,10 @@ again_locked: spin_lock(&cookie->lock); } if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { + if (atomic_read(&cookie->n_accesses) != 0) + /* still being accessed: postpone it */ + break; + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_LRU_DISCARDING); wake = true; -- cgit From 64a7ce76fb901bf9f9c36cf5d681328fc0fd4b5a Mon Sep 17 00:00:00 2001 From: yangerkun Date: Wed, 31 Jul 2024 12:38:35 +0800 Subject: libfs: fix infinite directory reads for offset dir After we switch tmpfs dir operations from simple_dir_operations to simple_offset_dir_operations, every rename fills a new dentry into the destination dir's maple tree (&SHMEM_I(inode)->dir_offsets->mt) with a free key starting from octx->newx_offset, and then sets newx_offset to that free key + 1. This leads to an infinite readdir when renames happen at the same time, which fails generic/736 in xfstests (details below). 1. create 5000 files (1 2 3...) under one dir 2. call readdir (man 3 readdir) once, and get one entry 3. rename(entry, "TEMPFILE"), then rename("TEMPFILE", entry) 4. 
loop over steps 2~3 until readdir returns nothing, or until we loop too many times (tmpfs breaks the test via the second condition) We use the same logic as commit 9b378f6ad48cf ("btrfs: fix infinite directory reads") to fix it: record the last_index when we open the dir, and do not emit entries whose index >= last_index. file->private_data, which the offset dir code already uses, can be used directly for this, and we also update last_index when we llseek the dir file. Fixes: a2e459555c5f ("shmem: stable directory offsets") Signed-off-by: yangerkun Link: https://lore.kernel.org/r/20240731043835.1828697-1-yangerkun@huawei.com Reviewed-by: Chuck Lever [brauner: only update last_index after seek when offset is zero like Jan suggested] Signed-off-by: Christian Brauner --- fs/libfs.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 8aa34870449f..02602d00939e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -450,6 +450,14 @@ void simple_offset_destroy(struct offset_ctx *octx) mtree_destroy(&octx->mt); } +static int offset_dir_open(struct inode *inode, struct file *file) +{ + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + + file->private_data = (void *)ctx->next_offset; + return 0; +} + /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated @@ -463,6 +471,9 @@ void simple_offset_destroy(struct offset_ctx *octx) */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { + struct inode *inode = file->f_inode; + struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode); + switch (whence) { case SEEK_CUR: offset += file->f_pos; @@ -476,7 +487,8 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) } /* In this case, ->private_data is protected by f_pos_lock */ - file->private_data = NULL; + if (!offset) + file->private_data = (void *)ctx->next_offset; return vfs_setpos(file, offset, LONG_MAX); } @@ -507,7 +519,7 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) { struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; @@ -515,17 +527,21 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) while (true) { dentry = offset_find_next(octx, ctx->pos); if (!dentry) - return ERR_PTR(-ENOENT); + return; + + if (dentry2offset(dentry) >= last_index) { + dput(dentry); + return; + } if (!offset_dir_emit(ctx, dentry)) { dput(dentry); - break; + return; } ctx->pos = dentry2offset(dentry) + 1; dput(dentry); } - return NULL; } /** @@ -552,22 +568,19 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; + long last_index = (long)file->private_data; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; - /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == DIR_OFFSET_MIN) - file->private_data = NULL; - else if (file->private_data == ERR_PTR(-ENOENT)) - return 0; - file->private_data = offset_iterate_dir(d_inode(dir), ctx); + offset_iterate_dir(d_inode(dir), ctx, last_index); return 0; } const struct 
file_operations simple_offset_dir_operations = { + .open = offset_dir_open, .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, -- cgit From 889ced4c9388785952d78d20d338bda2df209bb5 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 31 Jul 2024 09:39:02 +0200 Subject: netfs: clean up after renaming FSCACHE_DEBUG config Commit 6b8e61472529 ("netfs: Rename CONFIG_FSCACHE_DEBUG to CONFIG_NETFS_DEBUG") renames the config, but introduces two issues: First, NETFS_DEBUG mistakenly depends on the non-existing config NETFS, whereas the actual intended config is called NETFS_SUPPORT. Second, the config renaming misses to adjust the documentation of the functionality of this config. Clean up those two points. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20240731073902.69262-1-lukas.bulwahn@redhat.com Signed-off-by: Christian Brauner --- Documentation/filesystems/caching/fscache.rst | 8 ++++---- fs/netfs/Kconfig | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/caching/fscache.rst b/Documentation/filesystems/caching/fscache.rst index a74d7b052dc1..de1f32526cc1 100644 --- a/Documentation/filesystems/caching/fscache.rst +++ b/Documentation/filesystems/caching/fscache.rst @@ -318,10 +318,10 @@ where the columns are: Debugging ========= -If CONFIG_FSCACHE_DEBUG is enabled, the FS-Cache facility can have runtime -debugging enabled by adjusting the value in:: +If CONFIG_NETFS_DEBUG is enabled, the FS-Cache facility and NETFS support can +have runtime debugging enabled by adjusting the value in:: - /sys/module/fscache/parameters/debug + /sys/module/netfs/parameters/debug This is a bitmask of debugging streams to enable: @@ -343,6 +343,6 @@ This is a bitmask of debugging streams to enable: The appropriate set of values should be OR'd together and the result written to the control file. For example:: - echo $((1|8|512)) >/sys/module/fscache/parameters/debug + echo $((1|8|512)) >/sys/module/netfs/parameters/debug will turn on all function entry debugging. diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 1b78e8b65ebc..7701c037c328 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -24,7 +24,7 @@ config NETFS_STATS config NETFS_DEBUG bool "Enable dynamic debugging netfslib and FS-Cache" - depends on NETFS + depends on NETFS_SUPPORT help This permits debugging to be dynamically enabled in the local caching management module. If this is set, the debugging output may be -- cgit From 8e5ced7804cb9184c4a23f8054551240562a8eda Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 30 Jul 2024 17:01:40 +0100 Subject: netfs, ceph: Revert "netfs: Remove deprecated use of PG_private_2 as a second writeback flag" This reverts commit ae678317b95e760607c7b20b97c9cd4ca9ed6e1a. Revert the patch that removes the deprecated use of PG_private_2 in netfslib for the moment as Ceph is actually still using this to track data copied to the cache. 
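For context, a sketch rather than the actual ceph/netfs code: the PG_private_2 convention this revert restores is to mark a folio before the copy to the cache is submitted, clear the mark when the copy completes, and have anyone about to reuse the folio wait on the mark first. Assuming the folio_*_private_2() helpers from linux/pagemap.h, the pattern looks roughly like:

/* Sketch of the deprecated PG_private_2 convention; the real
 * call sites are in fs/ceph/addr.c and fs/netfs/io.c.
 */
static void start_copy_to_cache(struct folio *folio)
{
	folio_start_private_2(folio);	/* mark: cache copy in flight */
	/* ...submit the write to the cache here... */
}

static void copy_to_cache_done(struct folio *folio)
{
	folio_end_private_2(folio);	/* clear the mark, wake waiters */
}

static void before_reusing_folio(struct folio *folio)
{
	/* Writeback and truncate paths must not touch the folio
	 * while the cache copy is still in flight.
	 */
	folio_wait_private_2(folio);
}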
Fixes: ae678317b95e ("netfs: Remove deprecated use of PG_private_2 as a second writeback flag") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org https: //lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 19 +++++- fs/netfs/buffered_read.c | 8 ++- fs/netfs/io.c | 144 +++++++++++++++++++++++++++++++++++++++++++ include/trace/events/netfs.h | 1 + 4 files changed, 170 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8c16bc5250ef..73b5a07bf94d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -498,6 +498,11 @@ const struct netfs_request_ops ceph_netfs_ops = { }; #ifdef CONFIG_CEPH_FSCACHE +static void ceph_set_page_fscache(struct page *page) +{ + folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ +} + static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) { struct inode *inode = priv; @@ -515,6 +520,10 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b ceph_fscache_write_terminated, inode, true, caching); } #else +static inline void ceph_set_page_fscache(struct page *page) +{ +} + static inline void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, bool caching) { } @@ -706,6 +715,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = wlen; set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { @@ -789,6 +800,8 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) return AOP_WRITEPAGE_ACTIVATE; } + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ + err = writepage_nounlock(page, wbc); if (err == -ERESTARTSYS) { /* direct memory reclaimer was killed by SIGKILL. 
return 0 @@ -1062,7 +1075,8 @@ get_more_pages: unlock_page(page); break; } - if (PageWriteback(page)) { + if (PageWriteback(page) || + PagePrivate2(page) /* [DEPRECATED] */) { if (wbc->sync_mode == WB_SYNC_NONE) { doutc(cl, "%p under writeback\n", page); unlock_page(page); @@ -1070,6 +1084,7 @@ get_more_pages: } doutc(cl, "waiting on writeback %p\n", page); wait_on_page_writeback(page); + folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ } if (!clear_page_dirty_for_io(page)) { @@ -1254,6 +1269,8 @@ new_request: } set_page_writeback(page); + if (caching) + ceph_set_page_fscache(page); len += thp_size(page); } ceph_fscache_write_to_cache(inode, offset, len, caching); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a688d4c75d99..424048f9ed1f 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -466,7 +466,7 @@ retry: if (!netfs_is_cache_enabled(ctx) && netfs_skip_folio_read(folio, pos, len, false)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_folio; + goto have_folio_no_wait; } rreq = netfs_alloc_request(mapping, file, @@ -507,6 +507,12 @@ retry: netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: + if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; + } +have_folio_no_wait: *_folio = folio; _leave(" = 0"); return 0; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b98368..c179a1c73fa7 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -98,6 +98,146 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async) netfs_put_request(rreq, was_async, netfs_rreq_trace_put_complete); } +/* + * [DEPRECATED] Deal with the completion of writing the data to the cache. We + * have to clear the PG_fscache bits on the folios involved and release the + * caller's ref. + * + * May be called in softirq mode and we inherit a ref from the caller. + */ +static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, + bool was_async) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t unlocked = 0; + bool have_unlocked = false; + + rcu_read_lock(); + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); + + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + + /* We might have multiple writes from the same huge + * folio, but we mustn't unlock a folio more than once. + */ + if (have_unlocked && folio->index <= unlocked) + continue; + unlocked = folio_next_index(folio) - 1; + trace_netfs_folio(folio, netfs_folio_trace_end_copy); + folio_end_private_2(folio); + have_unlocked = true; + } + } + + rcu_read_unlock(); + netfs_rreq_completed(rreq, was_async); +} + +static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error, + bool was_async) /* [DEPRECATED] */ +{ + struct netfs_io_subrequest *subreq = priv; + struct netfs_io_request *rreq = subreq->rreq; + + if (IS_ERR_VALUE(transferred_or_error)) { + netfs_stat(&netfs_n_rh_write_failed); + trace_netfs_failure(rreq, subreq, transferred_or_error, + netfs_fail_copy_to_cache); + } else { + netfs_stat(&netfs_n_rh_write_done); + } + + trace_netfs_sreq(subreq, netfs_sreq_trace_write_term); + + /* If we decrement nr_copy_ops to 0, the ref belongs to us. 
*/ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, was_async); + + netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); +} + +/* + * [DEPRECATED] Perform any outstanding writes to the cache. We inherit a ref + * from the caller. + */ +static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq) +{ + struct netfs_cache_resources *cres = &rreq->cache_resources; + struct netfs_io_subrequest *subreq, *next, *p; + struct iov_iter iter; + int ret; + + trace_netfs_rreq(rreq, netfs_rreq_trace_copy); + + /* We don't want terminating writes trying to wake us up whilst we're + * still going through the list. + */ + atomic_inc(&rreq->nr_copy_ops); + + list_for_each_entry_safe(subreq, p, &rreq->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { + list_del_init(&subreq->rreq_link); + netfs_put_subrequest(subreq, false, + netfs_sreq_trace_put_no_copy); + } + } + + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { + /* Amalgamate adjacent writes */ + while (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + next = list_next_entry(subreq, rreq_link); + if (next->start != subreq->start + subreq->len) + break; + subreq->len += next->len; + list_del_init(&next->rreq_link); + netfs_put_subrequest(next, false, + netfs_sreq_trace_put_merged); + } + + ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len, + subreq->len, rreq->i_size, true); + if (ret < 0) { + trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write); + trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip); + continue; + } + + iov_iter_xarray(&iter, ITER_SOURCE, &rreq->mapping->i_pages, + subreq->start, subreq->len); + + atomic_inc(&rreq->nr_copy_ops); + netfs_stat(&netfs_n_rh_write); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_copy_to_cache); + trace_netfs_sreq(subreq, netfs_sreq_trace_write); + cres->ops->write(cres, subreq->start, &iter, + netfs_rreq_copy_terminated, subreq); + } + + /* If we decrement nr_copy_ops to 0, the usage ref belongs to us. */ + if (atomic_dec_and_test(&rreq->nr_copy_ops)) + netfs_rreq_unmark_after_write(rreq, false); +} + +static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */ +{ + struct netfs_io_request *rreq = + container_of(work, struct netfs_io_request, work); + + netfs_rreq_do_write_to_cache(rreq); +} + +static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */ +{ + rreq->work.func = netfs_rreq_write_to_cache_work; + if (!queue_work(system_unbound_wq, &rreq->work)) + BUG(); +} + /* * Handle a short read. 
*/ @@ -275,6 +415,10 @@ again: clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags); wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS); + if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) && + test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) + return netfs_rreq_write_to_cache(rreq); + netfs_rreq_completed(rreq, was_async); } diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index da23484268df..24ec3434d32e 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -145,6 +145,7 @@ EM(netfs_folio_trace_clear_g, "clear-g") \ EM(netfs_folio_trace_clear_s, "clear-s") \ EM(netfs_folio_trace_copy_to_cache, "mark-copy") \ + EM(netfs_folio_trace_end_copy, "end-copy") \ EM(netfs_folio_trace_filled_gaps, "filled-gaps") \ EM(netfs_folio_trace_kill, "kill") \ EM(netfs_folio_trace_kill_cc, "kill-cc") \ -- cgit From 7b589a9b45ae32aa9d7bece597490e141198d7a6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Aug 2024 19:38:46 +0100 Subject: netfs: Fix handling of USE_PGPRIV2 and WRITE_TO_CACHE flags The NETFS_RREQ_USE_PGPRIV2 and NETFS_RREQ_WRITE_TO_CACHE flags aren't used correctly. The problem is that we try to set them up during request initialisation, but the cache may still be in the process of setting up, and so the state may not be correct. Further, we secondarily sample the cache state and make contradictory decisions later. The issue arises because we set up the cache resources, which allows the cache's ->prepare_read() to switch on NETFS_SREQ_COPY_TO_CACHE - which triggers cache writing even if we didn't set the flags when allocating. Fix this in the following way: (1) Drop NETFS_ICTX_USE_PGPRIV2 and instead set NETFS_RREQ_USE_PGPRIV2 in ->init_request() rather than trying to juggle that in netfs_alloc_request(). (2) Repurpose NETFS_RREQ_USE_PGPRIV2 to merely indicate that if caching is to be done, then PG_private_2 is to be used rather than only setting it if we decide to cache and then having netfs_rreq_unlock_folios() set the non-PG_private_2 writeback-to-cache if it wasn't set. (3) Split netfs_rreq_unlock_folios() into two functions, one of which contains the deprecated code for using PG_private_2 to avoid accidentally doing the writeback path - and always use it if USE_PGPRIV2 is set. (4) As NETFS_ICTX_USE_PGPRIV2 is removed, make netfs_write_begin() always wait for PG_private_2. This function is deprecated and only used by ceph anyway, and so label it so. (5) Drop the NETFS_RREQ_WRITE_TO_CACHE flag and use fscache_operation_valid() on the cache_resources instead. This has the advantage of picking up the result of netfs_begin_cache_read() and fscache_begin_write_operation() - which are called after the object is initialised and will wait for the cache to come to a usable state. Just reverting ae678317b95e[1] isn't a sufficient fix, so this needs to be applied on top of that. Without this as well, things like: rcu: INFO: rcu_sched detected expedited stalls on CPUs/tasks: { and: WARNING: CPU: 13 PID: 3621 at fs/ceph/caps.c:3386 may happen, along with some UAFs due to PG_private_2 not getting used to wait on writeback completion.
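Reduced to a sketch (illustrative only; the real logic is in the diff below), the shape of the fix is that USE_PGPRIV2 now only selects the mechanism, while whether caching happens at all is read from the cache resources once they are known to be valid:

static void unlock_one_folio(struct netfs_io_request *rreq,
			     struct folio *folio, bool copy_to_cache)
{
	/* The flag says "use PG_private_2 if we cache"; whether we
	 * cache is decided by the now-valid cache resources, not by
	 * state sampled at allocation time.
	 */
	if (copy_to_cache &&
	    test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags) &&
	    fscache_operation_valid(&rreq->cache_resources))
		folio_start_private_2(folio); /* deprecated ceph path */

	folio_unlock(folio);
}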
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Reported-by: Max Kellermann Signed-off-by: David Howells cc: Ilya Dryomov cc: Xiubo Li cc: Hristo Venev cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk/ [1] Link: https://lore.kernel.org/r/1173209.1723152682@warthog.procyon.org.uk Signed-off-by: Christian Brauner --- fs/ceph/addr.c | 3 ++ fs/ceph/inode.c | 2 - fs/netfs/buffered_read.c | 125 ++++++++++++++++++++++++++++++++++++------- fs/netfs/objects.c | 10 ---- fs/netfs/write_issue.c | 4 +- fs/nfs/fscache.c | 2 + fs/nfs/fscache.h | 2 - include/linux/netfs.h | 3 -- include/trace/events/netfs.h | 1 + 9 files changed, 116 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 73b5a07bf94d..cc0a2240de98 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -424,6 +424,9 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file) struct ceph_netfs_request_data *priv; int ret = 0; + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); + if (rreq->origin != NETFS_READAHEAD) return 0; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8f8de8f33abb..71cd70514efa 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -577,8 +577,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) /* Set parameters for the netfs library */ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ - __set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags); spin_lock_init(&ci->i_ceph_lock); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 424048f9ed1f..27c750d39476 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -9,6 +9,97 @@ #include #include "internal.h" +/* + * [DEPRECATED] Unlock the folios in a read operation for when the filesystem + * is using PG_private_2 and direct writing to the cache from here rather than + * marking the page for writeback. + * + * Note that we don't touch folio->private in this code. + */ +static void netfs_rreq_unlock_folios_pgpriv2(struct netfs_io_request *rreq, + size_t *account) +{ + struct netfs_io_subrequest *subreq; + struct folio *folio; + pgoff_t start_page = rreq->start / PAGE_SIZE; + pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + bool subreq_failed = false; + + XA_STATE(xas, &rreq->mapping->i_pages, start_page); + + /* Walk through the pagecache and the I/O request lists simultaneously. + * We may have a mixture of cached and uncached sections and we only + * really want to write out the uncached sections. This is slightly + * complicated by the possibility that we might have huge pages with a + * mixture inside. 
+ */ + subreq = list_first_entry(&rreq->subrequests, + struct netfs_io_subrequest, rreq_link); + subreq_failed = (subreq->error < 0); + + trace_netfs_rreq(rreq, netfs_rreq_trace_unlock_pgpriv2); + + rcu_read_lock(); + xas_for_each(&xas, folio, last_page) { + loff_t pg_end; + bool pg_failed = false; + bool folio_started = false; + + if (xas_retry(&xas, folio)) + continue; + + pg_end = folio_pos(folio) + folio_size(folio) - 1; + + for (;;) { + loff_t sreq_end; + + if (!subreq) { + pg_failed = true; + break; + } + + if (!folio_started && + test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags) && + fscache_operation_valid(&rreq->cache_resources)) { + trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); + folio_start_private_2(folio); + folio_started = true; + } + + pg_failed |= subreq_failed; + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) + break; + + *account += subreq->transferred; + if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { + subreq = list_next_entry(subreq, rreq_link); + subreq_failed = (subreq->error < 0); + } else { + subreq = NULL; + subreq_failed = false; + } + + if (pg_end == sreq_end) + break; + } + + if (!pg_failed) { + flush_dcache_folio(folio); + folio_mark_uptodate(folio); + } + + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) + _debug("no unlock"); + else + folio_unlock(folio); + } + } + rcu_read_unlock(); +} + /* * Unlock the folios in a read operation. We need to set PG_writeback on any * folios we're going to write back before we unlock them. @@ -35,6 +126,12 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } } + /* Handle deprecated PG_private_2 case. */ + if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { + netfs_rreq_unlock_folios_pgpriv2(rreq, &account); + goto out; + } + /* Walk through the pagecache and the I/O request lists simultaneously. * We may have a mixture of cached and uncached sections and we only * really want to write out the uncached sections. This is slightly @@ -52,7 +149,6 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) loff_t pg_end; bool pg_failed = false; bool wback_to_cache = false; - bool folio_started = false; if (xas_retry(&xas, folio)) continue; @@ -66,17 +162,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) { - if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, - &subreq->flags)) { - trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); - folio_start_private_2(folio); - folio_started = true; - } - } else { - wback_to_cache |= - test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); - } + + wback_to_cache |= test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) @@ -124,6 +211,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) } rcu_read_unlock(); +out: task_io_account_read(account); if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); @@ -395,7 +483,7 @@ zero_out: } /** - * netfs_write_begin - Helper to prepare for writing + * netfs_write_begin - Helper to prepare for writing [DEPRECATED] * @ctx: The netfs context * @file: The file to read from * @mapping: The mapping to read from @@ -426,6 +514,9 @@ zero_out: * inode before calling this. * * This is usable whether or not caching is enabled. 
+ * + * Note that this should be considered deprecated and netfs_perform_write() + * used instead. */ int netfs_write_begin(struct netfs_inode *ctx, struct file *file, struct address_space *mapping, @@ -507,11 +598,9 @@ retry: netfs_put_request(rreq, false, netfs_rreq_trace_put_return); have_folio: - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) { - ret = folio_wait_private_2_killable(folio); - if (ret < 0) - goto error; - } + ret = folio_wait_private_2_killable(folio); + if (ret < 0) + goto error; have_folio_no_wait: *_folio = folio; _leave(" = 0"); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index f4a642727479..0294df70c3ff 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -24,10 +24,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, struct netfs_io_request *rreq; mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool; struct kmem_cache *cache = mempool->pool_data; - bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE || - origin == NETFS_DIO_READ || - origin == NETFS_DIO_WRITE); - bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx); int ret; for (;;) { @@ -56,12 +52,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, refcount_set(&rreq->ref, 1); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (cached) { - __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags); - if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) - /* Filesystem uses deprecated PG_private_2 marking. */ - __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); - } if (file && file->f_flags & O_NONBLOCK) __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 9258d30cffe3..3f7e37e50c7d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -94,6 +94,8 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, { struct netfs_io_request *wreq; struct netfs_inode *ictx; + bool is_buffered = (origin == NETFS_WRITEBACK || + origin == NETFS_WRITETHROUGH); wreq = netfs_alloc_request(mapping, file, start, 0, origin); if (IS_ERR(wreq)) @@ -102,7 +104,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, _enter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); - if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) + if (is_buffered && netfs_is_cache_enabled(ictx)) fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); wreq->contiguity = wreq->start; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 7202ce84d0eb..bf29a65c5027 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -265,6 +265,8 @@ static int nfs_netfs_init_request(struct netfs_io_request *rreq, struct file *fi { rreq->netfs_priv = get_nfs_open_context(nfs_file_open_context(file)); rreq->debug_id = atomic_inc_return(&nfs_netfs_debug_id); + /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */ + __set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags); return 0; } diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index fbed0027996f..e8adae1bc260 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -81,8 +81,6 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) { netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false); - /* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. 
 */
-	__set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags);
 }
 extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
 extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 5d0288938cc2..983816608f15 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -73,8 +73,6 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
-#define NETFS_ICTX_USE_PGPRIV2	31		/* [DEPRECATED] Use PG_private_2 to mark
-						 * write to cache on read */
 };
 
 /*
@@ -269,7 +267,6 @@ struct netfs_io_request {
 #define NETFS_RREQ_DONT_UNLOCK_FOLIOS	3	/* Don't unlock the folios on completion */
 #define NETFS_RREQ_FAILED		4	/* The request failed */
 #define NETFS_RREQ_IN_PROGRESS		5	/* Unlocked when the request completes */
-#define NETFS_RREQ_WRITE_TO_CACHE	7	/* Need to write to the cache */
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 #define NETFS_RREQ_NONBLOCK		9	/* Don't block if possible (O_NONBLOCK) */
 #define NETFS_RREQ_BLOCKED		10	/* We blocked */
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 24ec3434d32e..606b4a0f92da 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -51,6 +51,7 @@
 	EM(netfs_rreq_trace_resubmit,		"RESUBMT")	\
 	EM(netfs_rreq_trace_set_pause,		"PAUSE ")	\
 	EM(netfs_rreq_trace_unlock,		"UNLOCK ")	\
+	EM(netfs_rreq_trace_unlock_pgpriv2,	"UNLCK-2")	\
 	EM(netfs_rreq_trace_unmark,		"UNMARK ")	\
 	EM(netfs_rreq_trace_wait_ip,		"WAIT-IP")	\
 	EM(netfs_rreq_trace_wait_pause,		"WT-PAUS")	\
--
cgit

From 42fac187b5c746227c92d024f1caf33bc1d337e4 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 11 Apr 2024 16:41:20 -0400
Subject: btrfs: check delayed refs when we're checking if a ref exists

In the patch 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete
resume") I added some code to handle file systems that had been corrupted
by a bug that incorrectly skipped updating the drop progress key while
dropping a snapshot.  This code would check to see if we had already
deleted our reference for a child block, and skip the deletion if we had
already done so.

Unfortunately there is a bug, as the check would only check the on-disk
references.  I incorrectly assumed that blocks in an already-deleted
snapshot whose deletion is resumed on mount wouldn't be modified.

If we have 2 pending deleted snapshots that share blocks, we can easily
modify the rules for a block.  Take the following example:

subvolume a exists, and subvolume b is a snapshot of subvolume a.  They
share references to block 1.  Block 1 will have 2 full references, one for
subvolume a and one for subvolume b, and it belongs to subvolume a
(btrfs_header_owner(block 1) == subvolume a).

When deleting subvolume a, we will drop our full reference for block 1,
and because we are the owner we will drop our full reference for all of
block 1's children, convert block 1 to FULL BACKREF, and add a shared
reference to all of block 1's children.

Then we will start the snapshot deletion of subvolume b.  We look up the
extent info for block 1, which checks delayed refs and tells us that FULL
BACKREF is set, so it sets parent to the bytenr of block 1.  However,
because this is a resumed snapshot deletion, we call into
check_ref_exists().
Because check_ref_exists() only looks at the disk, it doesn't find the
shared backref for the child of block 1, and thus returns 0 and we skip
deleting the reference for the child of block 1 and continue.  This
orphans the child of block 1.

The fix is to look up the delayed refs, similar to what we do in
btrfs_lookup_extent_info().  However, we only care about whether the
reference exists or not.  If we fail to find our reference on disk, go
look up the bytenr in the delayed refs, and if it exists look for an
existing ref in the delayed ref head.  If that exists then we know we can
delete the reference safely and carry on.  If it doesn't exist we know we
have to skip over this block.

This bug has existed since I introduced this fix; however, it requires
having multiple deleted snapshots pending when we unmount.  We noticed
this in production because our shutdown path stops the container on the
system, which deletes a bunch of subvolumes, and then reboots the box.
This gives us plenty of opportunities to hit this issue.  Looking at the
history we've seen this occasionally in production, but we had a big
spike recently thanks to faster machines getting jobs with multiple
subvolumes in the job.

Chris Mason wrote a reproducer which does the following:

 mount /dev/nvme4n1 /btrfs
 btrfs subvol create /btrfs/s1
 simoop -E -f 4k -n 200000 -z /btrfs/s1
 while(true) ; do
	btrfs subvol snap /btrfs/s1 /btrfs/s2
	simoop -f 4k -n 200000 -r 10 -z /btrfs/s2
	btrfs subvol snap /btrfs/s2 /btrfs/s3
	btrfs balance start -dusage=80 /btrfs
	btrfs subvol del /btrfs/s2 /btrfs/s3
	umount /btrfs
	btrfsck /dev/nvme4n1 || exit 1
	mount /dev/nvme4n1 /btrfs
 done

On the second loop this would fail consistently; with my patch it has
been running for hours and hasn't failed.

I also used dm-log-writes to capture the state of the failure so I could
debug the problem.  Using the existing failure case to test my patch
validated that it fixes the problem.

Fixes: 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete resume")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/delayed-ref.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/delayed-ref.h | 2 ++
 fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++-----
 3 files changed, 114 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 2ac9296edccb..06a9e0542d70 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1134,6 +1134,73 @@ btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 byt
 	return find_ref_head(delayed_refs, bytenr, false);
 }
 
+static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
+{
+	int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY;
+
+	if (type < entry->type)
+		return -1;
+	if (type > entry->type)
+		return 1;
+
+	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
+		if (root < entry->ref_root)
+			return -1;
+		if (root > entry->ref_root)
+			return 1;
+	} else {
+		if (parent < entry->parent)
+			return -1;
+		if (parent > entry->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Check to see if a given root/parent reference is attached to the head.  This
+ * only checks for BTRFS_ADD_DELAYED_REF references that match, as that
+ * indicates the reference exists for the given root or parent.  This is for
+ * tree blocks only.
+ *
+ * @head:   the head of the bytenr we're searching.
+ * @root:   the root objectid of the reference if it is a normal reference.
+ * @parent: the parent if this is a shared backref.
+ */
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+				 u64 root, u64 parent)
+{
+	struct rb_node *node;
+	bool found = false;
+
+	lockdep_assert_held(&head->mutex);
+
+	spin_lock(&head->lock);
+	node = head->ref_tree.rb_root.rb_node;
+	while (node) {
+		struct btrfs_delayed_ref_node *entry;
+		int ret;
+
+		entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
+		ret = find_comp(entry, root, parent);
+		if (ret < 0) {
+			node = node->rb_left;
+		} else if (ret > 0) {
+			node = node->rb_right;
+		} else {
+			/*
+			 * We only want to count ADD actions, as drops mean the
+			 * ref doesn't exist.
+			 */
+			if (entry->action == BTRFS_ADD_DELAYED_REF)
+				found = true;
+			break;
+		}
+	}
+	spin_unlock(&head->lock);
+	return found;
+}
+
 void __cold btrfs_delayed_ref_exit(void)
 {
 	kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ef15e998be03..05f634eb472d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -389,6 +389,8 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 				  enum btrfs_reserve_flush_enum flush);
 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
+bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
+				 u64 root, u64 parent);
 
 static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
 {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ff9f0d41987e..feec49e6f9c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5472,23 +5472,62 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, u64 bytenr, u64 parent,
 			    int level)
 {
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *head;
 	struct btrfs_path *path;
 	struct btrfs_extent_inline_ref *iref;
 	int ret;
+	bool exists = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
+again:
 	ret = lookup_extent_backref(trans, path, &iref, bytenr,
 				    root->fs_info->nodesize, parent,
 				    btrfs_root_id(root), level, 0);
+	if (ret != -ENOENT) {
+		/*
+		 * If we get 0 then we found our reference; return 1.
+		 * Otherwise return the error if it's not -ENOENT.
+		 */
+		btrfs_free_path(path);
+		return (ret < 0) ? ret : 1;
+	}
+
+	/*
+	 * We could have a delayed ref with this reference, so look it up while
+	 * we're holding the path open to make sure we don't race with the
+	 * delayed ref running.
+	 */
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+	if (!head)
+		goto out;
+	if (!mutex_trylock(&head->mutex)) {
+		/*
+		 * We're contended, which means that the delayed ref is
+		 * running; get a reference and wait for the ref head to be
+		 * complete and then try again.
+		 */
+		refcount_inc(&head->refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(path);
+
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref_head(head);
+		goto again;
+	}
+
+	exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+	mutex_unlock(&head->mutex);
+out:
+	spin_unlock(&delayed_refs->lock);
 	btrfs_free_path(path);
-	if (ret == -ENOENT)
-		return 0;
-	if (ret < 0)
-		return ret;
-	return 1;
+	return exists ? 1 : 0;
 }
 
 /*
--
cgit

From 31723c9542dba1681cc3720571fdf12ffe0eddd9 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 12 Aug 2024 08:52:44 +0930
Subject: btrfs: tree-checker: reject BTRFS_FT_UNKNOWN dir type

[REPORT]
There is a bug report that the kernel is rejecting a mismatching inode
mode and its dir item:

 [ 1881.553937] BTRFS critical (device dm-0): inode mode mismatch with dir: inode mode=040700 btrfs type=2 dir type=0

[CAUSE]
It looks like the inode mode is correct, while the dir item type 0 is
BTRFS_FT_UNKNOWN, which should not be generated by btrfs at all.

This may be caused by a memory bit flip.

[ENHANCEMENT]
Although the tree-checker is not able to do any cross-leaf verification,
for this particular case we can at least reject the BTRFS_FT_UNKNOWN dir
type.

So here we enhance the dir type check from [0, BTRFS_FT_MAX) to
(0, BTRFS_FT_MAX).

Although the existing corruption cannot be fixed just by such enhanced
checking, it should prevent the same 0x2->0x0 bitflip for the dir type
from reaching disk in the future.

Reported-by: Kota
Link: https://lore.kernel.org/linux-btrfs/CACsxjPYnQF9ZF-0OhH16dAx50=BXXOcP74MxBc3BG+xae4vTTw@mail.gmail.com/
CC: stable@vger.kernel.org # 5.4+
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/tree-checker.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index a825fa598e3c..6f1e2f2215d9 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -569,9 +569,10 @@ static int check_dir_item(struct extent_buffer *leaf,
 
 	/* dir type check */
 	dir_type = btrfs_dir_ftype(leaf, di);
-	if (unlikely(dir_type >= BTRFS_FT_MAX)) {
+	if (unlikely(dir_type <= BTRFS_FT_UNKNOWN ||
+		     dir_type >= BTRFS_FT_MAX)) {
 		dir_item_err(leaf, slot,
-			"invalid dir item type, have %u expect [0, %u)",
+			"invalid dir item type, have %u expect (0, %u)",
 			dir_type, BTRFS_FT_MAX);
 		return -EUCLEAN;
 	}
--
cgit

From ae1e766f623f7a2a889a0b09eb076dd9a60efbe9 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Sun, 11 Aug 2024 11:53:42 +0100
Subject: btrfs: only run the extent map shrinker from kswapd tasks

Currently the extent map shrinker can be run by any task when it attempts
to allocate memory and there's enough memory pressure to trigger it.

To avoid too much latency we stop iterating over extent maps and removing
them once the task needs to reschedule.  This logic was introduced in
commit b3ebb9b7e92a ("btrfs: stop extent map shrinker if reschedule is
needed").

While that solved high latency problems for some use cases, it's still not
enough because, with too many tasks entering the extent map shrinker code
(either due to memory allocations or because they are kswapd tasks), we
end up with a very high level of contention on some spin locks, namely:

1) The fs_info->fs_roots_radix_lock spin lock, which we need to find
   roots to iterate over their inodes;

2) The spin lock of the xarray used to track open inodes for a root
   (struct btrfs_root::inodes) - on 6.10 kernels and below, it used to
   be a red black tree and the spin lock was root->inode_lock;

3) The fs_info->delayed_iput_lock spin lock since the shrinker adds
   delayed iputs (calls btrfs_add_delayed_iput()).

Instead of allowing the extent map shrinker to be run by any task, make
it run only by kswapd tasks.
This still solves the problem of running into OOM situations due to an unbounded extent map creation, which is simple to trigger by direct IO writes, as described in the changelog of commit 956a17d9d050 ("btrfs: add a shrinker for extent maps"), and by a similar case when doing buffered IO on files with a very large number of holes (keeping the file open and creating many holes, whose extent maps are only released when the file is closed). Reported-by: kzd Link: https://bugzilla.kernel.org/show_bug.cgi?id=219121 Reported-by: Octavia Togami Link: https://lore.kernel.org/linux-btrfs/CAHPNGSSt-a4ZZWrtJdVyYnJFscFjP9S7rMcvEMaNSpR556DdLA@mail.gmail.com/ Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") CC: stable@vger.kernel.org # 6.10+ Tested-by: kzd Tested-by: Octavia Togami Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 22 ++++++---------------- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 23b65dc73c00..10ac5f657e38 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1147,8 +1147,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c return 0; /* - * We want to be fast because we can be called from any path trying to - * allocate memory, so if the lock is busy we don't want to spend time + * We want to be fast so if the lock is busy we don't want to spend time * waiting for it - either some task is about to do IO for the inode or * we may have another task shrinking extent maps, here in this code, so * skip this inode. @@ -1191,9 +1190,7 @@ next: /* * Stop if we need to reschedule or there's contention on the * lock. This is to avoid slowing other tasks trying to take the - * lock and because the shrinker might be called during a memory - * allocation path and we want to avoid taking a very long time - * and slowing down all sorts of tasks. + * lock. */ if (need_resched() || rwlock_needbreak(&tree->lock)) break; @@ -1222,12 +1219,7 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx if (ctx->scanned >= ctx->nr_to_scan) break; - /* - * We may be called from memory allocation paths, so we don't - * want to take too much time and slowdown tasks. - */ - if (need_resched()) - break; + cond_resched(); inode = btrfs_find_first_inode(root, min_ino); } @@ -1285,14 +1277,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan) ctx.last_ino); } - /* - * We may be called from memory allocation paths, so we don't want to - * take too much time and slowdown tasks, so stop if we need reschedule. 
-	 */
-	while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
+	while (ctx.scanned < ctx.nr_to_scan) {
 		struct btrfs_root *root;
 		unsigned long count;
 
+		cond_resched();
+
 		spin_lock(&fs_info->fs_roots_radix_lock);
 		count = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
 					       (void **)&root,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 83478deada3b..11044e9e2cb1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include "messages.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -2409,6 +2410,15 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
 	const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 
+	/*
+	 * We may be called from any task trying to allocate memory and we don't
+	 * want to slow it down with scanning and dropping extent maps. It would
+	 * also cause heavy lock contention if many tasks concurrently enter
+	 * here. Therefore only allow kswapd tasks to scan and drop extent maps.
+	 */
+	if (!current_is_kswapd())
+		return 0;
+
 	return btrfs_free_extent_maps(fs_info, nr_to_scan);
 }
--
cgit

From 46a6e10a1ab16cc71d4a3cab73e79aabadd6b8ea Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 12 Aug 2024 14:18:06 +0100
Subject: btrfs: send: allow cloning non-aligned extent if it ends at i_size

If we find that an extent is shared but its end offset is not sector size
aligned, then we don't clone it and issue write operations instead.  This
is because the reflink (remap_file_range) operation does not allow cloning
unaligned ranges, except if the end offset of the range matches the i_size
of the source and destination files (and the start offset is sector size
aligned).

While this is not incorrect, because send can only guarantee that a file
has the same data in the source and destination snapshots, it's not
optimal and generates confusion and surprising behaviour for users.  For
example, running this test:

 $ cat test.sh
 #!/bin/bash

 DEV=/dev/sdi
 MNT=/mnt/sdi

 mkfs.btrfs -f $DEV
 mount $DEV $MNT

 # Use a file size not aligned to any possible sector size.
 file_size=$((1 * 1024 * 1024 + 5)) # 1MB + 5 bytes
 dd if=/dev/random of=$MNT/foo bs=$file_size count=1
 cp --reflink=always $MNT/foo $MNT/bar

 btrfs subvolume snapshot -r $MNT/ $MNT/snap
 rm -f /tmp/send-test
 btrfs send -f /tmp/send-test $MNT/snap

 umount $MNT

 mkfs.btrfs -f $DEV
 mount $DEV $MNT

 btrfs receive -vv -f /tmp/send-test $MNT

 xfs_io -r -c "fiemap -v" $MNT/snap/bar

 umount $MNT

Gives the following result:

(...)
mkfile o258-7-0 rename o258-7-0 -> bar write bar - offset=0 length=49152 write bar - offset=49152 length=49152 write bar - offset=98304 length=49152 write bar - offset=147456 length=49152 write bar - offset=196608 length=49152 write bar - offset=245760 length=49152 write bar - offset=294912 length=49152 write bar - offset=344064 length=49152 write bar - offset=393216 length=49152 write bar - offset=442368 length=49152 write bar - offset=491520 length=49152 write bar - offset=540672 length=49152 write bar - offset=589824 length=49152 write bar - offset=638976 length=49152 write bar - offset=688128 length=49152 write bar - offset=737280 length=49152 write bar - offset=786432 length=49152 write bar - offset=835584 length=49152 write bar - offset=884736 length=49152 write bar - offset=933888 length=49152 write bar - offset=983040 length=49152 write bar - offset=1032192 length=16389 chown bar - uid=0, gid=0 chmod bar - mode=0644 utimes bar utimes BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=06d640da-9ca1-604c-b87c-3375175a8eb3, stransid=7 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2055]: 26624..28679 2056 0x1 There's no clone operation to clone extents from the file foo into file bar and fiemap confirms there's no shared flag (0x2000). So update send_write_or_clone() so that it proceeds with cloning if the source and destination ranges end at the i_size of the respective files. After this changes the result of the test is: (...) mkfile o258-7-0 rename o258-7-0 -> bar clone bar - source=foo source offset=0 offset=0 length=1048581 chown bar - uid=0, gid=0 chmod bar - mode=0644 utimes bar utimes BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=582420f3-ea7d-564e-bbe5-ce440d622190, stransid=7 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2055]: 26624..28679 2056 0x2001 A test case for fstests will also follow up soon. 
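
The eligibility rule implemented by the patch can be summarised in one
helper (an illustrative sketch only: may_clone_unaligned_end() and its
parameters are not kernel symbols, and the real code in
send_write_or_clone() below obtains the source file's size via
get_inode_info()):

	/* May we clone an extent, given where its end offset falls? */
	static bool may_clone_unaligned_end(u64 end, u64 sectorsize,
					    u64 dst_i_size, u64 src_clone_end,
					    u64 src_i_size)
	{
		if (IS_ALIGNED(end, sectorsize))
			return true;	/* aligned end: clone as before */
		/* Unaligned end: reflink only accepts it at EOF on both sides. */
		return end == dst_i_size && src_clone_end == src_i_size;
	}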
Link: https://github.com/kdave/btrfs-progs/issues/572#issuecomment-2282841416
CC: stable@vger.kernel.org # 5.10+
Reviewed-by: Qu Wenruo
Signed-off-by: Filipe Manana
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/send.c | 52 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4ca711a773ef..7fc692fc76e1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6157,25 +6157,51 @@ static int send_write_or_clone(struct send_ctx *sctx,
 	u64 offset = key->offset;
 	u64 end;
 	u64 bs = sctx->send_root->fs_info->sectorsize;
+	struct btrfs_file_extent_item *ei;
+	u64 disk_byte;
+	u64 data_offset;
+	u64 num_bytes;
+	struct btrfs_inode_info info = { 0 };
 
 	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
 	if (offset >= end)
 		return 0;
 
-	if (clone_root && IS_ALIGNED(end, bs)) {
-		struct btrfs_file_extent_item *ei;
-		u64 disk_byte;
-		u64 data_offset;
+	num_bytes = end - offset;
 
-		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-				    struct btrfs_file_extent_item);
-		disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
-		data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
-		ret = clone_range(sctx, path, clone_root, disk_byte,
-				  data_offset, offset, end - offset);
-	} else {
-		ret = send_extent_data(sctx, path, offset, end - offset);
-	}
+	if (!clone_root)
+		goto write_data;
+
+	if (IS_ALIGNED(end, bs))
+		goto clone_data;
+
+	/*
+	 * If the extent end is not aligned, we can clone if the extent ends at
+	 * the i_size of the inode and the clone range ends at the i_size of the
+	 * source inode, otherwise the clone operation fails with -EINVAL.
+	 */
+	if (end != sctx->cur_inode_size)
+		goto write_data;
+
+	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
+	if (ret < 0)
+		return ret;
+
+	if (clone_root->offset + num_bytes == info.size)
+		goto clone_data;
+
+write_data:
+	ret = send_extent_data(sctx, path, offset, num_bytes);
+	sctx->cur_inode_next_write_offset = end;
+	return ret;
+
+clone_data:
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_file_extent_item);
+	disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+	data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+	ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
+			  num_bytes);
 	sctx->cur_inode_next_write_offset = end;
 	return ret;
 }
--
cgit

From 2a0629834cd82f05d424bbc193374f9a43d1f87d Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Fri, 9 Aug 2024 11:16:28 +0800
Subject: vfs: Don't evict inode under the inode lru traversing context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The inode reclaiming process (see prune_icache_sb()) first collects all
reclaimable inodes and marks them with the I_FREEING flag; at that point,
other processes will be stuck if they try getting these inodes (see
find_inode_fast()), and the reclaiming process then destroys the inodes
via dispose_list().

Some filesystems (e.g. ext4 with the ea_inode feature, ubifs with xattrs)
may do an inode lookup in the inode evicting callback function; if the
inode lookup is operated under the inode LRU traversing context, deadlock
problems may happen.

Case 1: In ext4_evict_inode(), the ea inode lookup can happen if the
ea_inode feature is enabled, and the lookup process will get stuck under
the evicting context like this:

 1. File A has inode i_reg and an ea inode i_ea
 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea
 3. Then, the following processes run like this:

           PA                                 PB
 echo 2 > /proc/sys/vm/drop_caches
  shrink_slab
   prune_dcache_sb
   // i_reg is added into lru, lru->i_ea->i_reg
   prune_icache_sb
    list_lru_walk_one
     inode_lru_isolate
      i_ea->i_state |= I_FREEING // set inode state
     inode_lru_isolate
      __iget(i_reg)
      spin_unlock(&i_reg->i_lock)
      spin_unlock(lru_lock)
                                     rm file A
                                      i_reg->nlink = 0
      iput(i_reg) // i_reg->nlink is 0, do evict
       ext4_evict_inode
        ext4_xattr_delete_inode
         ext4_xattr_inode_dec_ref_all
          ext4_xattr_inode_iget
           ext4_iget(i_ea->i_ino)
            iget_locked
             find_inode_fast
              __wait_on_freeing_inode(i_ea) ----→ AA deadlock
    dispose_list // cannot be executed by prune_icache_sb
     wake_up_bit(&i_ea->i_state)

Case 2: In the deleted-inode writing function ubifs_jnl_write_inode(), the
file deleting process holds BASEHD's wbuf->io_mutex while getting the
xattr inode, which could race with the inode reclaiming process (the
reclaiming process could try locking BASEHD's wbuf->io_mutex in the inode
evicting function); an ABBA deadlock problem would then happen as follows:

 1. File A has inode ia and an xattr (with inode ixa), regular file B has
    inode ib and an xattr.
 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa
 3. Then, the following three processes run like this:

           PA                  PB                     PC
 echo 2 > /proc/sys/vm/drop_caches
  shrink_slab
   prune_dcache_sb
   // ib and ia are added into lru, lru->ixa->ib->ia
   prune_icache_sb
    list_lru_walk_one
     inode_lru_isolate
      ixa->i_state |= I_FREEING // set inode state
     inode_lru_isolate
      __iget(ib)
      spin_unlock(&ib->i_lock)
      spin_unlock(lru_lock)
                           rm file B
                            ib->nlink = 0
                                                rm file A
                                                 iput(ia)
                                                  ubifs_evict_inode(ia)
                                                   ubifs_jnl_delete_inode(ia)
                                                    ubifs_jnl_write_inode(ia)
                                                     make_reservation(BASEHD)
                                                     // Lock wbuf->io_mutex
                                                     ubifs_iget(ixa->i_ino)
                                                      iget_locked
                                                       find_inode_fast
                                                        __wait_on_freeing_inode(ixa)
                                                              |
      iput(ib) // ib->nlink is 0, do evict                    |
       ubifs_evict_inode                                      |
        ubifs_jnl_delete_inode(ib)                            ↓
         ubifs_jnl_write_inode                          ABBA deadlock
          make_reservation(BASEHD) ←--------------------------
    dispose_list // cannot be executed by prune_icache_sb
     wake_up_bit(&ixa->i_state)

Fix the possible deadlock by using a new inode state flag,
I_LRU_ISOLATING, to pin the inode in memory while inode_lru_isolate()
reclaims its pages, instead of using an ordinary inode reference.  This
way, inode deletion cannot be triggered from inode_lru_isolate(), thus
avoiding the deadlock.  evict() is made to wait for I_LRU_ISOLATING to be
cleared before proceeding with inode cleanup.
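
In outline, the fix swaps a reference-count pin for a state-flag pin (a
schematic contrast with locking elided, not the patch itself; see the diff
below for the real helpers):

	/* Before: pin by reference; the final iput() may run evict() right
	 * here, inside the LRU isolation context. */
	__iget(inode);
	/* ... drop locks, reclaim the inode's pages ... */
	iput(inode);				/* can trigger eviction */

	/* After: pin by flag; clearing the flag never triggers eviction, it
	 * only wakes an evict() that is already waiting elsewhere. */
	inode->i_state |= I_LRU_ISOLATING;
	/* ... drop locks, reclaim the inode's pages ... */
	inode->i_state &= ~I_LRU_ISOLATING;
	wake_up_bit(&inode->i_state, __I_LRU_ISOLATING);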
Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 Fixes: e50e5129f384 ("ext4: xattr-in-inode support") Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") Cc: stable@vger.kernel.org Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com Reviewed-by: Jan Kara Suggested-by: Jan Kara Suggested-by: Mateusz Guzik Signed-off-by: Christian Brauner --- fs/inode.c | 39 +++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 5 +++++ 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 86670941884b..10c4619faeef 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -488,6 +488,39 @@ static void inode_lru_list_del(struct inode *inode) this_cpu_dec(nr_unused); } +static void inode_pin_lru_isolating(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); + inode->i_state |= I_LRU_ISOLATING; +} + +static void inode_unpin_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); + inode->i_state &= ~I_LRU_ISOLATING; + smp_mb(); + wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); +} + +static void inode_wait_for_lru_isolating(struct inode *inode) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & I_LRU_ISOLATING) { + DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); + wait_queue_head_t *wqh; + + wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); + spin_unlock(&inode->i_lock); + __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_LRU_ISOLATING); + } + spin_unlock(&inode->i_lock); +} + /** * inode_sb_list_add - add inode to the superblock list of inodes * @inode: inode to add @@ -657,6 +690,8 @@ static void evict(struct inode *inode) inode_sb_list_del(inode); + inode_wait_for_lru_isolating(inode); + /* * Wait for flusher thread to be done with the inode so that filesystem * does not start destroying it while writeback is still running. Since @@ -855,7 +890,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, * be under pressure before the cache inside the highmem zone. */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { - __iget(inode); + inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); if (remove_inode_buffers(inode)) { @@ -867,7 +902,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, __count_vm_events(PGINODESTEAL, reap); mm_account_reclaimed_pages(reap); } - iput(inode); + inode_unpin_lru_isolating(inode); spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fd34b5755c0b..fb0426f349fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2392,6 +2392,9 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * + * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding + * i_count. + * * Q: What is the difference between I_WILL_FREE and I_FREEING? 
*/ #define I_DIRTY_SYNC (1 << 0) @@ -2415,6 +2418,8 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, #define I_DONTCACHE (1 << 16) #define I_SYNC_QUEUED (1 << 17) #define I_PINNING_NETFS_WB (1 << 18) +#define __I_LRU_ISOLATING 19 +#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) -- cgit From e3786b29c54cdae3490b07180a54e2461f42144c Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Thu, 8 Aug 2024 14:29:38 +0100 Subject: 9p: Fix DIO read through netfs If a program is watching a file on a 9p mount, it won't see any change in size if the file being exported by the server is changed directly in the source filesystem, presumably because 9p doesn't have change notifications, and because netfs skips the reads if the file is empty. Fix this by attempting to read the full size specified when a DIO read is requested (such as when 9p is operating in unbuffered mode) and dealing with a short read if the EOF was less than the expected read. To make this work, filesystems using netfslib must not set NETFS_SREQ_CLEAR_TAIL if performing a DIO read where that read hit the EOF. I don't want to mandatorily clear this flag in netfslib for DIO because, say, ceph might make a read from an object that is not completely filled, but does not reside at the end of file - and so we need to clear the excess. This can be tested by watching an empty file over 9p within a VM (such as in the ktest framework): while true; do read content; if [ -n "$content" ]; then echo $content; break; fi; done < /host/tmp/foo then writing something into the empty file. The watcher should immediately display the file content and break out of the loop. Without this fix, it remains in the loop indefinitely. 
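
Each affected filesystem gains the same guard in its read-completion or
issue path; schematically, the recurring pattern from the hunks below is:

	/* Clearing the tail is only correct for buffered reads; for DIO,
	 * let the short read stand so netfslib can observe the real EOF.
	 */
	if (subreq->rreq->origin != NETFS_DIO_READ)
		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);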
Fixes: 80105ed2fd27 ("9p: Use netfslib read/write_iter") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218916 Signed-off-by: David Howells Link: https://lore.kernel.org/r/1229195.1723211769@warthog.procyon.org.uk cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Marc Dionne cc: Ilya Dryomov cc: Steve French cc: Paulo Alcantara cc: Trond Myklebust cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Dominique Martinet Signed-off-by: Christian Brauner --- fs/9p/vfs_addr.c | 3 ++- fs/afs/file.c | 3 ++- fs/ceph/addr.c | 6 ++++-- fs/netfs/io.c | 17 +++++++++++------ fs/nfs/fscache.c | 3 ++- fs/smb/client/file.c | 3 ++- 6 files changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index a97ceb105cd8..24fdc74caeba 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -75,7 +75,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, err ?: total, false); } diff --git a/fs/afs/file.c b/fs/afs/file.c index c3f0c45ae9a9..ec1be0091fdb 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -242,7 +242,8 @@ static void afs_fetch_data_notify(struct afs_operation *op) req->error = error; if (subreq) { - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (subreq->rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); netfs_subreq_terminated(subreq, error ?: req->actual_len, false); req->subreq = NULL; } else if (req->done) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index cc0a2240de98..c4744a02db75 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -246,7 +246,8 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (err >= 0) { if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); - if (err < subreq->len) + if (err < subreq->len && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { err = ceph_fscrypt_decrypt_extents(inode, @@ -282,7 +283,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + if (rreq->origin != NETFS_DIO_READ) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); if (subreq->start >= inode->i_size) diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c179a1c73fa7..5367caf3fa28 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -530,7 +530,8 @@ incomplete: if (transferred_or_error == 0) { if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) { - subreq->error = -ENODATA; + if (rreq->origin != NETFS_DIO_READ) + subreq->error = -ENODATA; goto failed; } } else { @@ -601,9 +602,14 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, } if (subreq->len > ictx->zero_point - subreq->start) subreq->len = ictx->zero_point - subreq->start; + + /* We limit buffered reads to the EOF, but let the + * server deal with larger-than-EOF DIO/unbuffered + * reads. 
+	 */
+	if (subreq->len > rreq->i_size - subreq->start)
+		subreq->len = rreq->i_size - subreq->start;
 	}
-	if (subreq->len > rreq->i_size - subreq->start)
-		subreq->len = rreq->i_size - subreq->start;
 	if (rreq->rsize && subreq->len > rreq->rsize)
 		subreq->len = rreq->rsize;
@@ -739,11 +745,10 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 	do {
 		_debug("submit %llx + %llx >= %llx",
 		       rreq->start, rreq->submitted, rreq->i_size);
-		if (rreq->origin == NETFS_DIO_READ &&
-		    rreq->start + rreq->submitted >= rreq->i_size)
-			break;
 		if (!netfs_rreq_submit_slice(rreq, &io_iter))
 			break;
+		if (test_bit(NETFS_SREQ_NO_PROGRESS, &rreq->flags))
+			break;
 		if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
 		    test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
 			break;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index bf29a65c5027..7a558dea75c4 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -363,7 +363,8 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
 		return;
 
 	sreq = netfs->sreq;
-	if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
+	if (test_bit(NFS_IOHDR_EOF, &hdr->flags) &&
+	    sreq->rreq->origin != NETFS_DIO_READ)
 		__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
 
 	if (hdr->error)
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index b2405dd4d4d4..3f3842e7b44a 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -217,7 +217,8 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
 		goto out;
 	}
 
-	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	if (subreq->rreq->origin != NETFS_DIO_READ)
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 
 	rc = rdata->server->ops->async_readv(rdata);
 out:
--
cgit

From 810ee43d9cd245d138a2733d87a24858a23f577d Mon Sep 17 00:00:00 2001
From: Phillip Lougher
Date: Mon, 12 Aug 2024 00:28:21 +0100
Subject: Squashfs: sanity check symbolic link size

Syzkaller reports a "KMSAN: uninit-value in pick_link" bug.

This is caused by an uninitialised page, which is ultimately caused by a
corrupted symbolic link size read from disk.

The reason the corrupted symlink size causes an uninitialised page is the
following sequence of events:

1. squashfs_read_inode() is called to read the symbolic link from disk.
   This assigns the corrupted value 3875536935 to inode->i_size.

2. Later squashfs_symlink_read_folio() is called, which assigns this
   corrupted value to the length variable which, being a signed int,
   overflows, producing a negative number.

3. The following loop that fills in the page contents checks that the
   number of copied bytes is less than length; length being negative means
   the loop is skipped, producing an uninitialised page.

This patch adds a sanity check that the symbolic link size is not larger
than expected.

--
Signed-off-by: Phillip Lougher
Link: https://lore.kernel.org/r/20240811232821.13903-1-phillip@squashfs.org.uk
Reported-by: Lizhi Xu
Reported-by: syzbot+24ac24ff58dc5b0d26b9@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/000000000000a90e8c061e86a76b@google.com/
V2: fix spelling mistake.
Signed-off-by: Christian Brauner --- fs/squashfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 16bd693d0b3a..d5918eba27e3 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -279,8 +279,13 @@ int squashfs_read_inode(struct inode *inode, long long ino) if (err < 0) goto failed_read; - set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + if (inode->i_size > PAGE_SIZE) { + ERROR("Corrupted symlink\n"); + return -EINVAL; + } + + set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; -- cgit From 6252690f7e1b173b86a4c27dfc046b351ab423e7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 9 Aug 2024 16:54:22 +0900 Subject: btrfs: fix invalid mapping of extent xarray state In __extent_writepage_io(), we call btrfs_set_range_writeback() -> folio_start_writeback(), which clears PAGECACHE_TAG_DIRTY mark from the mapping xarray if the folio is not dirty. This worked fine before commit 97713b1a2ced ("btrfs: do not clear page dirty inside extent_write_locked_range()"). After the commit, however, the folio is still dirty at this point, so the mapping DIRTY tag is not cleared anymore. Then, __extent_writepage_io() calls btrfs_folio_clear_dirty() to clear the folio's dirty flag. That results in the page being unlocked with a "strange" state. The page is not PageDirty, but the mapping tag is set as PAGECACHE_TAG_DIRTY. This strange state looks like causing a hang with a call trace below when running fstests generic/091 on a null_blk device. It is waiting for a folio lock. While I don't have an exact relation between this hang and the strange state, fixing the state also fixes the hang. And, that state is worth fixing anyway. This commit reorders btrfs_folio_clear_dirty() and btrfs_set_range_writeback() in __extent_writepage_io(), so that the PAGECACHE_TAG_DIRTY tag is properly removed from the xarray. [464.274] task:fsx state:D stack:0 pid:3034 tgid:3034 ppid:2853 flags:0x00004002 [464.286] Call Trace: [464.291] [464.295] __schedule+0x10ed/0x6260 [464.301] ? __pfx___blk_flush_plug+0x10/0x10 [464.308] ? __submit_bio+0x37c/0x450 [464.314] ? __pfx___schedule+0x10/0x10 [464.321] ? lock_release+0x567/0x790 [464.327] ? __pfx_lock_acquire+0x10/0x10 [464.334] ? __pfx_lock_release+0x10/0x10 [464.340] ? __pfx_lock_acquire+0x10/0x10 [464.347] ? __pfx_lock_release+0x10/0x10 [464.353] ? do_raw_spin_lock+0x12e/0x270 [464.360] schedule+0xdf/0x3b0 [464.365] io_schedule+0x8f/0xf0 [464.371] folio_wait_bit_common+0x2ca/0x6d0 [464.378] ? folio_wait_bit_common+0x1cc/0x6d0 [464.385] ? __pfx_folio_wait_bit_common+0x10/0x10 [464.392] ? __pfx_filemap_get_folios_tag+0x10/0x10 [464.400] ? __pfx_wake_page_function+0x10/0x10 [464.407] ? __pfx___might_resched+0x10/0x10 [464.414] ? do_raw_spin_unlock+0x58/0x1f0 [464.420] extent_write_cache_pages+0xe49/0x1620 [btrfs] [464.428] ? lock_acquire+0x435/0x500 [464.435] ? __pfx_extent_write_cache_pages+0x10/0x10 [btrfs] [464.443] ? btrfs_do_write_iter+0x493/0x640 [btrfs] [464.451] ? orc_find.part.0+0x1d4/0x380 [464.457] ? __pfx_lock_release+0x10/0x10 [464.464] ? __pfx_lock_release+0x10/0x10 [464.471] ? btrfs_do_write_iter+0x493/0x640 [btrfs] [464.478] btrfs_writepages+0x1cc/0x460 [btrfs] [464.485] ? __pfx_btrfs_writepages+0x10/0x10 [btrfs] [464.493] ? is_bpf_text_address+0x6e/0x100 [464.500] ? 
kernel_text_address+0x145/0x160 [464.507] ? unwind_get_return_address+0x5e/0xa0 [464.514] ? arch_stack_walk+0xac/0x100 [464.521] do_writepages+0x176/0x780 [464.527] ? lock_release+0x567/0x790 [464.533] ? __pfx_do_writepages+0x10/0x10 [464.540] ? __pfx_lock_acquire+0x10/0x10 [464.546] ? __pfx_stack_trace_save+0x10/0x10 [464.553] ? do_raw_spin_lock+0x12e/0x270 [464.560] ? do_raw_spin_unlock+0x58/0x1f0 [464.566] ? _raw_spin_unlock+0x23/0x40 [464.573] ? wbc_attach_and_unlock_inode+0x3da/0x7d0 [464.580] filemap_fdatawrite_wbc+0x113/0x180 [464.587] ? prepare_pages.constprop.0+0x13c/0x5c0 [btrfs] [464.596] __filemap_fdatawrite_range+0xaf/0xf0 [464.603] ? __pfx___filemap_fdatawrite_range+0x10/0x10 [464.611] ? trace_irq_enable.constprop.0+0xce/0x110 [464.618] ? kasan_quarantine_put+0xd7/0x1e0 [464.625] btrfs_start_ordered_extent+0x46f/0x570 [btrfs] [464.633] ? __pfx_btrfs_start_ordered_extent+0x10/0x10 [btrfs] [464.642] ? __clear_extent_bit+0x2c0/0x9d0 [btrfs] [464.650] btrfs_lock_and_flush_ordered_range+0xc6/0x180 [btrfs] [464.659] ? __pfx_btrfs_lock_and_flush_ordered_range+0x10/0x10 [btrfs] [464.669] btrfs_read_folio+0x12a/0x1d0 [btrfs] [464.676] ? __pfx_btrfs_read_folio+0x10/0x10 [btrfs] [464.684] ? __pfx_filemap_add_folio+0x10/0x10 [464.691] ? __pfx___might_resched+0x10/0x10 [464.698] ? __filemap_get_folio+0x1c5/0x450 [464.705] prepare_uptodate_page+0x12e/0x4d0 [btrfs] [464.713] prepare_pages.constprop.0+0x13c/0x5c0 [btrfs] [464.721] ? fault_in_iov_iter_readable+0xd2/0x240 [464.729] btrfs_buffered_write+0x5bd/0x12f0 [btrfs] [464.737] ? __pfx_btrfs_buffered_write+0x10/0x10 [btrfs] [464.745] ? __pfx_lock_release+0x10/0x10 [464.752] ? generic_write_checks+0x275/0x400 [464.759] ? down_write+0x118/0x1f0 [464.765] ? up_write+0x19b/0x500 [464.770] btrfs_direct_write+0x731/0xba0 [btrfs] [464.778] ? __pfx_btrfs_direct_write+0x10/0x10 [btrfs] [464.785] ? __pfx___might_resched+0x10/0x10 [464.792] ? lock_acquire+0x435/0x500 [464.798] ? lock_acquire+0x435/0x500 [464.804] btrfs_do_write_iter+0x494/0x640 [btrfs] [464.811] ? __pfx_btrfs_do_write_iter+0x10/0x10 [btrfs] [464.819] ? __pfx___might_resched+0x10/0x10 [464.825] ? rw_verify_area+0x6d/0x590 [464.831] vfs_write+0x5d7/0xf50 [464.837] ? __might_fault+0x9d/0x120 [464.843] ? __pfx_vfs_write+0x10/0x10 [464.849] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.856] ? lock_release+0x567/0x790 [464.862] ksys_write+0xfb/0x1d0 [464.867] ? __pfx_ksys_write+0x10/0x10 [464.873] ? _raw_spin_unlock+0x23/0x40 [464.879] ? btrfs_getattr+0x4af/0x670 [btrfs] [464.886] ? vfs_getattr_nosec+0x79/0x340 [464.892] do_syscall_64+0x95/0x180 [464.898] ? __do_sys_newfstat+0xde/0xf0 [464.904] ? __pfx___do_sys_newfstat+0x10/0x10 [464.911] ? trace_irq_enable.constprop.0+0xce/0x110 [464.918] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.925] ? do_syscall_64+0xa1/0x180 [464.931] ? trace_irq_enable.constprop.0+0xce/0x110 [464.939] ? trace_irq_enable.constprop.0+0xce/0x110 [464.946] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.953] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.960] ? do_syscall_64+0xa1/0x180 [464.966] ? btrfs_file_llseek+0xb1/0xfb0 [btrfs] [464.973] ? trace_irq_enable.constprop.0+0xce/0x110 [464.980] ? syscall_exit_to_user_mode+0xac/0x2a0 [464.987] ? __pfx_btrfs_file_llseek+0x10/0x10 [btrfs] [464.995] ? trace_irq_enable.constprop.0+0xce/0x110 [465.002] ? __pfx_btrfs_file_llseek+0x10/0x10 [btrfs] [465.010] ? do_syscall_64+0xa1/0x180 [465.016] ? lock_release+0x567/0x790 [465.022] ? __pfx_lock_acquire+0x10/0x10 [465.028] ? __pfx_lock_release+0x10/0x10 [465.034] ? 
trace_irq_enable.constprop.0+0xce/0x110 [465.042] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.049] ? do_syscall_64+0xa1/0x180 [465.055] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.062] ? do_syscall_64+0xa1/0x180 [465.068] ? syscall_exit_to_user_mode+0xac/0x2a0 [465.075] ? do_syscall_64+0xa1/0x180 [465.081] ? clear_bhb_loop+0x25/0x80 [465.087] ? clear_bhb_loop+0x25/0x80 [465.093] ? clear_bhb_loop+0x25/0x80 [465.099] entry_SYSCALL_64_after_hwframe+0x76/0x7e [465.106] RIP: 0033:0x7f093b8ee784 [465.111] RSP: 002b:00007ffc29d31b28 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [465.122] RAX: ffffffffffffffda RBX: 0000000000006000 RCX: 00007f093b8ee784 [465.131] RDX: 000000000001de00 RSI: 00007f093b6ed200 RDI: 0000000000000003 [465.141] RBP: 000000000001de00 R08: 0000000000006000 R09: 0000000000000000 [465.150] R10: 0000000000023e00 R11: 0000000000000202 R12: 0000000000006000 [465.160] R13: 0000000000023e00 R14: 0000000000023e00 R15: 0000000000000001 [465.170] [465.174] INFO: lockdep is turned off. Reported-by: Shinichiro Kawasaki Fixes: 97713b1a2ced ("btrfs: do not clear page dirty inside extent_write_locked_range()") Reviewed-by: Qu Wenruo Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa7f8148cd0d..c73cd4f89015 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1496,6 +1496,13 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, free_extent_map(em); em = NULL; + /* + * Although the PageDirty bit might be cleared before entering + * this function, subpage dirty bit is not cleared. + * So clear subpage dirty bit here so next time we won't submit + * page for range already written to disk. + */ + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); btrfs_set_range_writeback(inode, cur, cur + iosize - 1); if (!PageWriteback(page)) { btrfs_err(inode->root->fs_info, @@ -1503,13 +1510,6 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, page->index, cur, end); } - /* - * Although the PageDirty bit is cleared before entering this - * function, subpage dirty bit is not cleared. - * So clear subpage dirty bit here so next time we won't submit - * page for range already written to disk. - */ - btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page)); -- cgit From f50733b45d865f91db90919f8311e2127ce5a0cb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 8 Aug 2024 11:39:08 -0700 Subject: exec: Fix ToCToU between perm check and set-uid/gid usage When opening a file for exec via do_filp_open(), permission checking is done against the file's metadata at that moment, and on success, a file pointer is passed back. Much later in the execve() code path, the file metadata (specifically mode, uid, and gid) is used to determine if/how to set the uid and gid. However, those values may have changed since the permissions check, meaning the execution may gain unintended privileges. For example, if a file could change permissions from executable and not set-id: ---------x 1 root root 16048 Aug 7 13:16 target to set-id and non-executable: ---S------ 1 root root 16048 Aug 7 13:16 target it is possible to gain root privileges when execution should have been disallowed. 
While this race condition is rare in real-world scenarios, it has been observed (and proven exploitable) when package managers are updating the setuid bits of installed programs. Such files start with being world-executable but then are adjusted to be group-exec with a set-uid bit. For example, "chmod o-x,u+s target" makes "target" executable only by uid "root" and gid "cdrom", while also becoming setuid-root: -rwxr-xr-x 1 root cdrom 16048 Aug 7 13:16 target becomes: -rwsr-xr-- 1 root cdrom 16048 Aug 7 13:16 target But racing the chmod means users without group "cdrom" membership can get the permission to execute "target" just before the chmod, and when the chmod finishes, the exec reaches bprm_fill_uid(), and performs the setuid to root, violating the expressed authorization of "only cdrom group members can setuid to root". Re-check that we still have execute permissions in case the metadata has changed. It would be better to keep a copy from the perm-check time, but until we can do that refactoring, the least-bad option is to do a full inode_permission() call (under inode lock). It is understood that this is safe against dead-locks, but hardly optimal. Reported-by: Marco Vanotti Tested-by: Marco Vanotti Suggested-by: Linus Torvalds Cc: stable@vger.kernel.org Cc: Eric Biederman Cc: Alexander Viro Cc: Christian Brauner Signed-off-by: Kees Cook --- fs/exec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a126e3d1cacb..50e76cc633c4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1692,6 +1692,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; + int err; if (!mnt_may_suid(file->f_path.mnt)) return; @@ -1708,12 +1709,17 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) /* Be careful if suid/sgid is set */ inode_lock(inode); - /* reload atomically mode/uid/gid now that lock held */ + /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); + err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); + /* Did the exec bit vanish out from under us? Give up. */ + if (err) + return; + /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) -- cgit From a24e6e7146e361aa0855cf8ee3b2e80b8eb692e3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:40:39 -0400 Subject: bcachefs: delete faulty fastpath in bch2_btree_path_traverse_cached() bch2_btree_path_traverse_cached() was previously checking if it could just relock the path, which is a common idiom in path traversal. However, it was using btree_node_relock(), not btree_path_relock(); btree_path_relock() only succeeds if the path was in state BTREE_ITER_NEED_RELOCK. If the path was in state BTREE_ITER_NEED_TRAVERSE, a full traversal is needed; this led to a null ptr deref in bch2_btree_path_traverse_cached(). And the short circuit check here isn't needed, since it was already done in the main bch2_btree_path_traverse_one().
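A rough sketch of why the node-level check is the wrong tool here (names and fields below are assumptions for illustration; this is not the bcachefs source):

#include <stdbool.h>
#include <stddef.h>

struct btree_node { int unused; };	/* stand-in */

enum path_state {
	PATH_UPTODATE,		/* ready to use */
	PATH_NEED_RELOCK,	/* only locks were dropped */
	PATH_NEED_TRAVERSE,	/* node pointers may be stale or NULL */
};

struct path {
	enum path_state state;
	struct btree_node *node;	/* filled in by traversal */
};

/* node-level relock: never consults the path state */
static bool node_relock(struct path *p)
{
	return p->node != NULL;
}

/* path-level relock: the shortcut is only legal from NEED_RELOCK */
static bool path_relock(struct path *p)
{
	return p->state == PATH_NEED_RELOCK && node_relock(p);
}

static int full_traversal(struct path *p)
{
	static struct btree_node n;	/* pretend we found and locked a node */

	p->node = &n;
	p->state = PATH_UPTODATE;
	return 0;
}

static int traverse_cached(struct path *p)
{
	/*
	 * The deleted fastpath amounted to "if (node_relock(p)) return 0;",
	 * which could declare a NEED_TRAVERSE path uptodate without ever
	 * refilling what it points at: the null ptr deref.  path_relock()
	 * refuses that case; in the real fix the shortcut is deleted
	 * entirely, since bch2_btree_path_traverse_one() already made this
	 * check, and it is kept here only to show the state test.
	 */
	if (path_relock(p)) {
		p->state = PATH_UPTODATE;
		return 0;
	}
	return full_traversal(p);
}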
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index f2f2e525460b..79954490627c 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -497,11 +497,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock_notrace(trans, path, 0)) { - path->uptodate = BTREE_ITER_UPTODATE; - return 0; - } - int ret; do { ret = btree_path_traverse_cached_fast(trans, path); -- cgit From bd864bc2d90790e00b02b17c75fb951cb4b0bb8b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:24:03 -0400 Subject: bcachefs: Fix bch2_trigger_alloc when upgrading from old versions bch2_trigger_alloc was assuming that the new key would always be newly created and thus always an alloc_v4 key, but that's not the case when it's called from btree_gc. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index d9c5a92fa708..0a8a1bc9a4ac 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -829,7 +829,19 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); - struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; + + struct bch_alloc_v4 *new_a; + if (likely(new.k->type == KEY_TYPE_alloc_v4)) { + new_a = bkey_s_to_alloc_v4(new).v; + } else { + BUG_ON(!(flags & BTREE_TRIGGER_gc)); + + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) + goto err; + new_a = &new_ka->v; + } if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); -- cgit From d9e615762bf2eb7459fb0f270525f8b186bce6b7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 04:53:12 -0400 Subject: bcachefs: bch2_accounting_invalid() fixup Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 046ac92b6639..212f53927111 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -154,7 +154,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) - bkey_fsck_err_on(acc_k.replicas.devs[i] > acc_k.replicas.devs[i + 1], + bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], c, err, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); -- cgit From 486d920735325e507d965c2639ba2775b81fd329 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 22:47:55 -0400 Subject: bcachefs: disk accounting: ignore unknown types forward compat fix Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 212f53927111..03a9de6c2e0a 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -528,6 +528,9 @@ int bch2_gc_accounting_done(struct
bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, e->pos); + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS]; u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS]; @@ -760,6 +763,12 @@ void bch2_verify_accounting_clean(struct bch_fs *c) struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); unsigned nr = bch2_accounting_counters(k.k); + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { @@ -775,9 +784,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; -- cgit From 48d6cc1b4895ada0781da11a0a483332a236ec14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 13 Aug 2024 01:01:35 -0400 Subject: bcachefs: Add missing downgrade table entry Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-downgrade.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 6c4469f53313..9f82d497d9e0 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -104,6 +104,7 @@ BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ + BCH_FSCK_ERR_accounting_replicas_not_marked, \ BCH_FSCK_ERR_bkey_version_in_future) struct upgrade_downgrade_entry { -- cgit From 968feb854a86b59cc4bc72af3105989706ca2c7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 7 Aug 2024 16:34:28 -0400 Subject: bcachefs: Convert for_each_btree_node() to lockrestart_do() for_each_btree_node() now works similarly to for_each_btree_key(): the loop body is passed as an argument and run inside lockrestart_do(). This now calls trans_begin() on every loop iteration, which fixes an SRCU warning in backpointers fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++---------- fs/bcachefs/btree_iter.c | 1 + fs/bcachefs/btree_iter.h | 42 +++++++++++++++++++++++++++--------------- fs/bcachefs/debug.c | 38 +++++++++----------------------------- 4 files changed, 42 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 3cc02479a982..9edc4c5f735c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -763,27 +763,22 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, btree < BTREE_ID_NR && !ret; btree++) { unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; - struct btree_iter iter; - struct btree *b; if (!(BIT_ULL(btree) & btree_leaf_mask) && !(BIT_ULL(btree) & btree_interior_mask)) continue; - bch2_trans_begin(trans); - - __for_each_btree_node(trans, iter, btree, + ret = __for_each_btree_node(trans, iter, btree, btree == start.btree ?
start.pos : POS_MIN, - 0, depth, BTREE_ITER_prefetch, b, ret) { + 0, depth, BTREE_ITER_prefetch, b, ({ mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = BBPOS(btree, b->key.k.p); - bch2_trans_iter_exit(trans, &iter); - return 0; + break; } - } - bch2_trans_iter_exit(trans, &iter); + 0; + })); } return ret; diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index aa8a049071f4..2e84d22e17bd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1900,6 +1900,7 @@ err: goto out; } +/* Only kept for -tools */ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) { struct btree *b; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c7725865309c..dca62375d7d3 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -600,23 +600,35 @@ void bch2_trans_srcu_unlock(struct btree_trans *); u32 bch2_trans_begin(struct btree_trans *); -/* - * XXX - * this does not handle transaction restarts from bch2_btree_iter_next_node() - * correctly - */ -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b, _ret) \ - for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ - _start, _locks_want, _depth, _flags); \ - (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ - !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ - (_b) = bch2_btree_iter_next_node(&(_iter))) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _do) \ +({ \ + bch2_trans_begin((_trans)); \ + \ + struct btree_iter _iter; \ + bch2_trans_node_iter_init((_trans), &_iter, (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ + struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ +}) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b, _ret) \ - __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b, _ret) + _flags, _b, _do) \ + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index ebabab171fe5..45aec1afdb0e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -397,47 +397,27 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; - struct btree_trans *trans; - struct btree_iter iter; - struct btree *b; - ssize_t ret; i->ubuf = buf; i->size = size; i->ret = 0; - ret = flush_buf(i); + ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; - trans = bch2_trans_get(i->c); -retry: - bch2_trans_begin(trans); - - for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) { - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = !bpos_eq(SPOS_MAX, b->key.k.p) - ? 
bpos_successor(b->key.k.p) : b->key.k.p; - - ret = drop_locks_do(trans, flush_buf(i)); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - bch2_trans_put(trans); - - if (!ret) - ret = flush_buf(i); + return bch2_trans_run(i->c, + for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? bpos_successor(b->key.k.p) + : b->key.k.p; - return ret ?: i->ret; + drop_locks_do(trans, flush_buf(i)); + }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { -- cgit From 7254555c440ff6b136aa97fb3c33fd5e0bb4fb9f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:40:09 -0400 Subject: bcachefs: Add hysteresis to waiting on btree key cache flush This helps ensure key cache reclaim isn't contending with threads waiting for the key cache to be flushed, and fixes a severe performance bug. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++++ fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index e6b2cd0dd2c1..113309f62918 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -20,6 +20,15 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) return nr_dirty > max_dirty; } +static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 2048 + (nr_keys * 5) / 8; + + return nr_dirty <= max_dirty; +} + int bch2_btree_key_cache_journal_flush(struct journal *, struct journal_entry_pin *, u64); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cca336fe46e9..f567bfb82850 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -927,7 +927,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags static int journal_reclaim_wait_done(struct bch_fs *c) { int ret = bch2_journal_error(&c->journal) ?: - !bch2_btree_key_cache_must_wait(c); + bch2_btree_key_cache_wait_done(c); if (!ret) journal_reclaim_kick(&c->journal); -- cgit From 790666c8ac6427ddaa00f502dc44073c1e039355 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 14:31:17 -0400 Subject: bcachefs: Improve trans_blocked_journal_reclaim tracepoint include information about the state of the btree key cache Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.h | 9 +++++++-- fs/bcachefs/trace.c | 1 + fs/bcachefs/trace.h | 27 +++++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 113309f62918..51d6289b8dee 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -11,13 +11,18 @@ static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) return max_t(ssize_t, 0, nr_dirty - max_dirty); } -static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +static inline ssize_t __bch2_btree_key_cache_must_wait(struct bch_fs *c) { size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return
nr_dirty > max_dirty; + return nr_dirty - max_dirty; +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + return __bch2_btree_key_cache_must_wait(c) > 0; } static inline bool bch2_btree_key_cache_wait_done(struct bch_fs *c) diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c index dc48b52b01b4..dfad1d06633d 100644 --- a/fs/bcachefs/trace.c +++ b/fs/bcachefs/trace.c @@ -4,6 +4,7 @@ #include "buckets.h" #include "btree_cache.h" #include "btree_iter.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "keylist.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index d0e6b9deb6cb..c62f00322d1e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -988,10 +988,33 @@ TRACE_EVENT(trans_restart_split_race, __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + + __field(unsigned long, key_cache_nr_keys ) + __field(unsigned long, key_cache_nr_dirty ) + __field(long, must_wait ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->key_cache_nr_keys = atomic_long_read(&trans->c->btree_key_cache.nr_keys); + __entry->key_cache_nr_dirty = atomic_long_read(&trans->c->btree_key_cache.nr_dirty); + __entry->must_wait = __bch2_btree_key_cache_must_wait(trans->c); + ), + + TP_printk("%s %pS key cache keys %lu dirty %lu must_wait %li", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->key_cache_nr_keys, + __entry->key_cache_nr_dirty, + __entry->must_wait) ); TRACE_EVENT(trans_restart_journal_preres_get, -- cgit From 06a8693b890c0cf7d94bf7c6f0e2adf3a3aaa346 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 15:48:18 -0400 Subject: bcachefs: Add a time_stat for blocked on key cache flush Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/btree_trans_commit.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index eedf2d6045e7..0c7086e00d18 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -447,6 +447,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ x(blocked_write_buffer_full) \ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index f567bfb82850..ac0c92683aad 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -973,9 +973,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], true); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); + + track_event_change(&c->times[BCH_TIME_blocked_key_cache_flush], false); + if (ret < 0) break; -- cgit From c99471024f24b3cbafc02bf5b112ecf34b0dbd40 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 23:29:46 -0400 Subject: bcachefs: Fix warning in __bch2_fsck_err() for trans not passed in Signed-off-by: Kent 
Overstreet --- fs/bcachefs/btree_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2c424435ca4a..70ebcca08ba2 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1767,6 +1767,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, set_btree_node_read_in_flight(b); + /* we can't pass the trans to read_done() for fsck errors, so it must be unlocked */ + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, true); if (btree_node_read_error(b)) { -- cgit From d97de0d017cde0d442c3d144b4f969f43064cc0f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 21:31:25 -0400 Subject: bcachefs: Make bkey_fsck_err() a wrapper around fsck_err() bkey_fsck_err() was added as an interface that looks like fsck_err(), but previously all it did was ensure that the appropriate error counter was incremented in the superblock. This is a cleanup and bugfix patch that converts it to a wrapper around fsck_err(). This is needed to fix an issue with the upgrade path to disk_accounting_v3, where the "silent fix" error list now includes bkey_fsck errors; fsck_err() handles this in a unified way, and since we need to change printing of bkey fsck errors from the caller to the inner bkey_fsck_err() calls, this ends up being a pretty big change. Also, rename .invalid() methods to .validate(), for clarity, while we're changing the function signature anyways (to drop the printbuf argument). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 63 ++++++++-------- fs/bcachefs/alloc_background.h | 26 +++---- fs/bcachefs/backpointers.c | 8 +- fs/bcachefs/backpointers.h | 5 +- fs/bcachefs/bkey.h | 7 +- fs/bcachefs/bkey_methods.c | 109 +++++++++++++-------------- fs/bcachefs/bkey_methods.h | 21 +++--- fs/bcachefs/btree_io.c | 67 +++++------------ fs/bcachefs/btree_node_scan.c | 2 +- fs/bcachefs/btree_trans_commit.c | 72 ++++-------------- fs/bcachefs/btree_update_interior.c | 16 +--- fs/bcachefs/data_update.c | 6 +- fs/bcachefs/dirent.c | 33 ++++----- fs/bcachefs/dirent.h | 5 +- fs/bcachefs/disk_accounting.c | 13 ++-- fs/bcachefs/disk_accounting.h | 5 +- fs/bcachefs/ec.c | 15 ++-- fs/bcachefs/ec.h | 5 +- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 22 ++++++ fs/bcachefs/error.h | 39 ++++++---- fs/bcachefs/extents.c | 144 ++++++++++++++++++------------------ fs/bcachefs/extents.h | 24 +++--- fs/bcachefs/inode.c | 77 +++++++++---------- fs/bcachefs/inode.h | 24 +++--- fs/bcachefs/journal_io.c | 24 ++---- fs/bcachefs/lru.c | 9 +-- fs/bcachefs/lru.h | 5 +- fs/bcachefs/quota.c | 8 +- fs/bcachefs/quota.h | 5 +- fs/bcachefs/reflink.c | 19 ++--- fs/bcachefs/reflink.h | 22 +++--- fs/bcachefs/snapshot.c | 42 +++++------ fs/bcachefs/snapshot.h | 11 ++- fs/bcachefs/subvolume.c | 16 ++-- fs/bcachefs/subvolume.h | 5 +- fs/bcachefs/xattr.c | 21 +++--- fs/bcachefs/xattr.h | 5 +- 38 files changed, 448 insertions(+), 553 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0a8a1bc9a4ac..fd3a2522bc3e 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -196,75 +196,71 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) return DIV_ROUND_UP(bytes, sizeof(u64)); } -int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct
bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; /* allow for unknown fields */ - bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, - alloc_v1_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), + c, alloc_v1_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); fsck_err: return ret; } -int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_alloc_unpacked u; int ret = 0; - bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, - alloc_v2_unpack_error, + bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), + c, alloc_v2_unpack_error, "unpack error"); fsck_err: return ret; } -int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err, - alloc_v4_val_size_bad, + bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, - alloc_v4_backpointers_start_bad, + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, - alloc_key_data_type_bad, + bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); for (unsigned i = 0; i < 2; i++) bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, - c, err, - alloc_key_io_time_bad, + c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? 
"read" : "write", a.v->io_time[i], LRU_TIME_MAX); @@ -282,7 +278,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || a.v->cached_sectors || a.v->stripe, - c, err, alloc_key_empty_but_have_data, + c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, a.v->dirty_sectors, @@ -296,7 +292,7 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_DATA_parity: bkey_fsck_err_on(!a.v->dirty_sectors && !stripe_sectors, - c, err, alloc_key_dirty_sectors_0, + c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", bch2_data_type_str(a.v->data_type)); break; @@ -305,12 +301,12 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || stripe_sectors || a.v->stripe, - c, err, alloc_key_cached_inconsistency, + c, alloc_key_cached_inconsistency, "data type inconsistency"); bkey_fsck_err_on(!a.v->io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, - c, err, alloc_key_cached_but_read_time_zero, + c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; case BCH_DATA_stripe: @@ -513,14 +509,13 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) : 0; } -int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, - bucket_gens_val_size_bad, + bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), + c, bucket_gens_val_size_bad, "bad val size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); fsck_err: diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 96a0444ea78f..260e7fa83d05 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -240,52 +240,48 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v1_invalid, \ + .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v2_invalid, \ + .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) 
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v3_invalid, \ + .key_validate = bch2_alloc_v3_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ - .key_invalid = bch2_alloc_v4_invalid, \ + .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) -int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ - .key_invalid = bch2_bucket_gens_invalid, \ + .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 9edc4c5f735c..d4da6343efa9 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -47,9 +47,8 @@ static bool extent_matches_bp(struct bch_fs *c, return false; } -int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); @@ -68,8 +67,7 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), - c, err, - backpointer_bucket_offset_wrong, + c, backpointer_bucket_offset_wrong, "backpointer bucket_offset wrong"); fsck_err: return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6021de1c5e98..7daecadb764e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,14 +18,13 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k, - enum bch_validate_flags, struct printbuf *); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ - .key_invalid = bch2_backpointer_invalid, \ + .key_validate = bch2_backpointer_validate, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 936357149cf0..e34cb2bf329c 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -10,9 +10,10 @@ #include "vstructs.h" enum bch_validate_flags { - BCH_VALIDATE_write = (1U << 0), - BCH_VALIDATE_commit = (1U << 1), - BCH_VALIDATE_journal = (1U << 2), + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), }; #if 0 diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 5f07cf853d0c..88d8958281e8 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -27,27 +27,27 @@ const char * const bch2_bkey_types[] = { NULL }; -static int deleted_key_invalid(struct bch_fs *c, 
struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } #define bch2_bkey_ops_deleted ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) #define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ - .key_invalid = deleted_key_invalid, \ + .key_validate = deleted_key_validate, \ }) -static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k), c, err, - bkey_val_size_nonzero, + bkey_fsck_err_on(bkey_val_bytes(k.k), + c, bkey_val_size_nonzero, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); fsck_err: @@ -55,11 +55,11 @@ fsck_err: } #define bch2_bkey_ops_error ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -73,17 +73,17 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_validate = key_type_cookie_validate, \ .val_to_text = key_type_cookie_to_text, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ }) -static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } @@ -98,9 +98,9 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, datalen, min(datalen, 32U), d.v->data); } -#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ - .key_invalid = key_type_inline_data_invalid, \ - .val_to_text = key_type_inline_data_to_text, \ +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_validate = key_type_inline_data_validate, \ + .val_to_text = key_type_inline_data_to_text, \ }) static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) @@ -110,7 +110,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ } #define bch2_bkey_ops_set ((struct bkey_ops) { \ - .key_invalid = empty_val_key_invalid, \ + .key_validate = empty_val_key_validate, \ .key_merge = key_type_set_merge, \ }) @@ -123,9 +123,8 @@ const struct bkey_ops bch2_bkey_ops[] = { const struct bkey_ops bch2_bkey_null_ops = { }; -int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -133,15 +132,15 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); int ret = 0; - bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err, - bkey_val_size_too_small, + 
bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, + c, bkey_val_size_too_small, "bad val size (%zu < %u)", bkey_val_bytes(k.k), ops->min_val_size); - if (!ops->key_invalid) + if (!ops->key_validate) return 0; - ret = ops->key_invalid(c, k, flags, err); + ret = ops->key_validate(c, k, flags); fsck_err: return ret; } @@ -161,18 +160,17 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); } -int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) +int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bch_validate_flags flags) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; int ret = 0; - bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err, - bkey_u64s_too_small, + bkey_fsck_err_on(k.k->u64s < BKEY_U64s, + c, bkey_u64s_too_small, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); if (type >= BKEY_TYPE_NR) @@ -180,8 +178,8 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && - !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err, - bkey_invalid_type_for_btree, + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", bch2_btree_node_type_str(type), k.k->type < KEY_TYPE_MAX @@ -189,17 +187,17 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, : "(unknown)"); if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { - bkey_fsck_err_on(k.k->size == 0, c, err, - bkey_extent_size_zero, + bkey_fsck_err_on(k.k->size == 0, + c, bkey_extent_size_zero, "size == 0"); - bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err, - bkey_extent_size_greater_than_offset, + bkey_fsck_err_on(k.k->size > k.k->p.offset, + c, bkey_extent_size_greater_than_offset, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); } else { - bkey_fsck_err_on(k.k->size, c, err, - bkey_size_nonzero, + bkey_fsck_err_on(k.k->size, + c, bkey_size_nonzero, "size != 0"); } @@ -207,12 +205,12 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_id btree = type - 1; if (btree_type_has_snapshots(btree)) { - bkey_fsck_err_on(!k.k->p.snapshot, c, err, - bkey_snapshot_zero, + bkey_fsck_err_on(!k.k->p.snapshot, + c, bkey_snapshot_zero, "snapshot == 0"); } else if (!btree_type_has_snapshot_field(btree)) { - bkey_fsck_err_on(k.k->p.snapshot, c, err, - bkey_snapshot_nonzero, + bkey_fsck_err_on(k.k->p.snapshot, + c, bkey_snapshot_nonzero, "nonzero snapshot"); } else { /* @@ -221,34 +219,33 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, */ } - bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err, - bkey_at_pos_max, + bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), + c, bkey_at_pos_max, "key at POS_MAX"); } fsck_err: return ret; } -int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, +int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - enum bch_validate_flags flags, - struct printbuf *err) + enum bch_validate_flags flags) { - return __bch2_bkey_invalid(c, k, type, flags, err) ?: - bch2_bkey_val_invalid(c, k, flags, err); + return __bch2_bkey_validate(c, k, type, flags) ?: + bch2_bkey_val_validate(c, k, flags); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, struct printbuf 
*err) + struct bkey_s_c k, enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err, - bkey_before_start_of_btree_node, + bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), + c, bkey_before_start_of_btree_node, "key before start of btree node"); - bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err, - bkey_after_end_of_btree_node, + bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), + c, bkey_after_end_of_btree_node, "key past end of btree node"); fsck_err: return ret; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index baef0722f5fb..3df3dd2723a1 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -14,15 +14,15 @@ extern const char * const bch2_bkey_types[]; extern const struct bkey_ops bch2_bkey_null_ops; /* - * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * key_validate: checks validity of @k, returns 0 if good or -EINVAL if bad. If * invalid, entire key will be deleted. * * When invalid, error string is returned via @err. @rw indicates whether key is * being read or written; more aggressive checks can be enabled when rw == WRITE. */ struct bkey_ops { - int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err); + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,14 +48,13 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags, struct printbuf *); -int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, - struct bkey_s_c, struct printbuf *); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bch_validate_flags); +int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, + enum bch_validate_flags); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 70ebcca08ba2..56ea9a77cd4a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -836,14 +836,13 @@ fsck_err: return ret; } -static int bset_key_invalid(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, - bool updated_range, int rw, - struct printbuf *err) +static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw) { - return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?: - (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); + return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: + (rw == WRITE ? 
bch2_bkey_val_validate(c, k, 0) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -858,12 +857,9 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, if (!bkeyp_u64s_valid(&b->format, k)) return false; - struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf); - printbuf_exit(&buf); - return ret; + return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -915,19 +911,11 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { - printbuf_reset(&buf); - bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "invalid bkey: %s", buf.buf); + ret = bset_key_validate(c, b, u.s_c, updated_range, write); + if (ret == -BCH_ERR_fsck_delete_bkey) goto drop_this_key; - } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, @@ -1228,23 +1216,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - printbuf_reset(&buf); - - if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + ret = bch2_bkey_val_validate(c, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->version, MAX_VERSION))) { - printbuf_reset(&buf); - - prt_printf(&buf, "invalid bkey: "); - bch2_bkey_val_invalid(c, u.s_c, READ, &buf); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, u.s_c); - - btree_err(-BCH_ERR_btree_node_read_err_fixable, - c, NULL, b, i, k, - btree_node_bad_bkey, - "%s", buf.buf); - btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -1253,6 +1228,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, set_btree_bset_end(b, b->set); continue; } + if (ret) + goto fsck_err; if (u.k->type == KEY_TYPE_btree_ptr_v2) { struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); @@ -1954,18 +1931,14 @@ static void btree_node_write_endio(struct bio *bio) static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - struct printbuf buf = PRINTBUF; bool saw_error; - int ret; - - ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE, &buf); - if (ret) - bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); - printbuf_exit(&buf); - if (ret) + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; + } ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 001107226377..b28c649c6838 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -530,7 +530,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), 
BKEY_TYPE_btree, 0, NULL)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index ac0c92683aad..1a1e9d2036da 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -818,50 +818,6 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, - enum bch_validate_flags flags, - struct btree_insert_entry *i, - struct printbuf *err) -{ - struct bch_fs *c = trans->c; - - printbuf_reset(err); - prt_printf(err, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - printbuf_indent_add(err, 2); - - bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); - prt_newline(err); - - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); - bch2_print_string_as_lines(KERN_ERR, err->buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - -static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans, - struct jset_entry *i) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey on insert from %s\n", trans->fn); - printbuf_indent_add(&buf, 2); - - bch2_journal_entry_to_text(&buf, c, i); - prt_newline(&buf); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - - bch2_inconsistent_error(c); - bch2_dump_trans_updates(trans); - - return -EINVAL; -} - static int bch2_trans_commit_journal_pin_flush(struct journal *j, struct journal_entry_pin *_pin, u64 seq) { @@ -1064,20 +1020,19 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; enum bch_validate_flags invalid_flags = 0; if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); return ret; + } + btree_insert_entry_checks(trans, i); } for (struct jset_entry *i = trans->journal_entries; @@ -1088,13 +1043,14 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - if (unlikely(bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags))) - ret = bch2_trans_commit_journal_entry_invalid(trans, i); - - if (ret) + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, invalid_flags); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); return ret; + } } if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e61f9695771e..b3454d4619e8 100644 --- 
a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1364,18 +1364,10 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) { - printbuf_reset(&buf); - prt_printf(&buf, "inserting invalid bkey\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - prt_printf(&buf, "\n "); - bch2_bkey_invalid(c, bkey_i_to_s_c(insert), - btree_node_type(b), WRITE, &buf); - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf); - - bch2_fs_inconsistent(c, "%s", buf.buf); + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), + btree_node_type(b), BCH_VALIDATE_write) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 0087b8555ead..6a854c918496 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -250,10 +250,8 @@ restart_drop_extra_replicas: * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - struct printbuf err = PRINTBUF; - int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err); - printbuf_exit(&err); - + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), + BCH_VALIDATE_commit); if (invalid) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d743da89308e..32bfdf19289a 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -100,20 +100,19 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .is_visible = dirent_is_visible, }; -int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); int ret = 0; - bkey_fsck_err_on(!d_name.len, c, err, - dirent_empty_name, + bkey_fsck_err_on(!d_name.len, + c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err, - dirent_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + c, dirent_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); @@ -121,27 +120,27 @@ int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) 
*/ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, err, - dirent_name_too_long, + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); - bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err, - dirent_name_embedded_nul, + bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), + c, dirent_name_embedded_nul, "dirent has stray data after name's NUL"); bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) || - (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err, - dirent_name_dot_or_dotdot, + (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), + c, dirent_name_dot_or_dotdot, "invalid name"); - bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err, - dirent_name_has_slash, + bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), + c, dirent_name_has_slash, "name with /"); bkey_fsck_err_on(d.v->d_type != DT_SUBVOL && - le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err, - dirent_to_itself, + le64_to_cpu(d.v->d_inum) == d.k->p.inode, + c, dirent_to_itself, "dirent points to own directory"); fsck_err: return ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 24037e6e0a09..8945145865c5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -7,12 +7,11 @@ enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ - .key_invalid = bch2_dirent_invalid, \ + .key_validate = bch2_dirent_validate, \ .val_to_text = bch2_dirent_to_text, \ .min_val_size = 16, \ }) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 03a9de6c2e0a..f059cbffdf23 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -126,9 +126,8 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) -int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); @@ -144,18 +143,18 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, break; case BCH_DISK_ACCOUNTING_replicas: bkey_fsck_err_on(!acc_k.replicas.nr_devs, - c, err, accounting_key_replicas_nr_devs_0, + c, accounting_key_replicas_nr_devs_0, "accounting key replicas entry with nr_devs=0"); bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs || (acc_k.replicas.nr_required > 1 && acc_k.replicas.nr_required == acc_k.replicas.nr_devs), - c, err, accounting_key_replicas_nr_required_bad, + c, accounting_key_replicas_nr_required_bad, "accounting key replicas entry with bad nr_required"); for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++) bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1], - c, err, accounting_key_replicas_devs_unsorted, + c, accounting_key_replicas_devs_unsorted, "accounting key replicas entry with unsorted devs"); end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas); @@ -178,7 
+177,7 @@ int bch2_accounting_invalid(struct bch_fs *c, struct bkey_s_c k, } bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), - c, err, accounting_key_junk_at_end, + c, accounting_key_junk_at_end, "junk at end of accounting key"); fsck_err: return ret; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 3d3f25e08b69..b92f8c2e3054 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -82,14 +82,13 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); #define bch2_bkey_ops_accounting ((struct bkey_ops) { \ - .key_invalid = bch2_accounting_invalid, \ + .key_validate = bch2_accounting_validate, \ .val_to_text = bch2_accounting_to_text, \ .swab = bch2_accounting_swab, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 84f1cbf6497f..141a4c63142f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -107,24 +107,23 @@ struct ec_bio { /* Stripes btree keys: */ -int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) || - bpos_gt(k.k->p, POS(0, U32_MAX)), c, err, - stripe_pos_bad, + bpos_gt(k.k->p, POS(0, U32_MAX)), + c, stripe_pos_bad, "stripe at bad pos"); - bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err, - stripe_val_size_bad, + bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), + c, stripe_val_size_bad, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 84a23eeb6249..90962b3c0130 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -8,8 +8,7 @@ enum bch_validate_flags; -int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, @@ -17,7 +16,7 @@ int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_stripe ((struct bkey_ops) { \ - .key_invalid = bch2_stripe_invalid, \ + .key_validate = bch2_stripe_validate, \ .val_to_text = bch2_stripe_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_stripe, \ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a268af3e52bf..ab5a7adece10 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -166,6 +166,7 @@ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ 
x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index a62b63108820..95afa7bf2020 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -416,6 +416,28 @@ err: return ret; } +int __bch2_bkey_fsck_err(struct bch_fs *c, + struct bkey_s_c k, + enum bch_fsck_flags flags, + enum bch_sb_error_id err, + const char *fmt, ...) +{ + struct printbuf buf = PRINTBUF; + va_list args; + + prt_str(&buf, "invalid bkey "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + void bch2_flush_fsck_errs(struct bch_fs *c) { struct fsck_err_state *s, *n; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 995e6bba9bad..2f1b86978f36 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -4,6 +4,7 @@ #include #include +#include "bkey_types.h" #include "sb-errors.h" struct bch_dev; @@ -166,24 +167,30 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -__printf(4, 0) -static inline void bch2_bkey_fsck_err(struct bch_fs *c, - struct printbuf *err_msg, - enum bch_sb_error_id err_type, - const char *fmt, ...) -{ - va_list args; +__printf(5, 6) +int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, + enum bch_fsck_flags, + enum bch_sb_error_id, + const char *, ...); - va_start(args, fmt); - prt_vprintf(err_msg, fmt, args); - va_end(args); -} - -#define bkey_fsck_err(c, _err_msg, _err_type, ...) \ +/* + * for now, bkey fsck errors are always handled by deleting the entire key - + * this will change at some point + */ +#define bkey_fsck_err(c, _err_type, _err_msg, ...) 
\ do { \ - prt_printf(_err_msg, __VA_ARGS__); \ - bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \ - ret = -BCH_ERR_invalid_bkey; \ + if ((flags & BCH_VALIDATE_silent)) { \ + ret = -BCH_ERR_fsck_delete_bkey; \ + goto fsck_err; \ + } \ + int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) \ + ret = _ret; \ + ret = -BCH_ERR_fsck_delete_bkey; \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 07973198e35f..4419ad3e454e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -171,17 +171,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err, - btree_ptr_val_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, + c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -192,28 +191,27 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, bch2_bkey_ptrs_to_text(out, c, k); } -int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, - c, err, btree_ptr_v2_val_too_big, + c, btree_ptr_v2_val_too_big, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p), - c, err, btree_ptr_v2_min_key_bad, + c, btree_ptr_v2_min_key_bad, "min_key > key"); if (flags & BCH_VALIDATE_write) bkey_fsck_err_on(!bp.v->sectors_written, - c, err, btree_ptr_v2_written_0, + c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_invalid(c, k, flags, err); + ret = bch2_bkey_ptrs_validate(c, k, flags); fsck_err: return ret; } @@ -399,15 +397,14 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; - bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err, - reservation_key_nr_replicas_invalid, + bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, + c, reservation_key_nr_replicas_invalid, "invalid nr_replicas (%u)", r.v->nr_replicas); fsck_err: return ret; @@ -1102,14 +1099,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } - -static int extent_ptr_invalid(struct bch_fs *c, - struct bkey_s_c k, - enum bch_validate_flags flags, - const struct bch_extent_ptr *ptr, - unsigned size_ondisk, - bool metadata, - struct printbuf *err) +static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, + 
enum bch_validate_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) { int ret = 0; @@ -1128,28 +1123,27 @@ static int extent_ptr_invalid(struct bch_fs *c, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err, - ptr_to_duplicate_device, + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, "multiple pointers to same device (%u)", ptr->dev); - bkey_fsck_err_on(bucket >= nbuckets, c, err, - ptr_after_last_bucket, + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); - bkey_fsck_err_on(bucket < first_bucket, c, err, - ptr_before_first_bucket, + bkey_fsck_err_on(bucket < first_bucket, + c, ptr_before_first_bucket, "pointer before first bucket (%llu < %u)", bucket, first_bucket); - bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, c, err, - ptr_spans_multiple_buckets, + bkey_fsck_err_on(bucket_offset + size_ondisk > bucket_size, + c, ptr_spans_multiple_buckets, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, bucket_size); fsck_err: return ret; } -int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1164,25 +1158,24 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { - bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err, - extent_ptrs_invalid_entry, - "invalid extent entry type (got %u, max %u)", - __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, + c, extent_ptrs_invalid_entry, + "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); bkey_fsck_err_on(bkey_is_btree_ptr(k.k) && - !extent_entry_is_ptr(entry), c, err, - btree_ptr_has_non_ptr, + !extent_entry_is_ptr(entry), + c, btree_ptr_has_non_ptr, "has non ptr field"); switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_invalid(c, k, flags, &entry->ptr, - size_ondisk, false, err); + ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); if (ret) return ret; - bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err, - ptr_cached_and_erasure_coded, + bkey_fsck_err_on(entry->ptr.cached && have_ec, + c, ptr_cached_and_erasure_coded, "cached, erasure coded ptr"); if (!entry->ptr.unwritten) @@ -1199,44 +1192,50 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err, - ptr_crc_uncompressed_size_too_small, + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, "checksum offset + key size > uncompressed size"); - bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err, - ptr_crc_csum_type_unknown, + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, "invalid checksum type"); - bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err, - 
ptr_crc_compression_type_unknown, + bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, + c, ptr_crc_compression_type_unknown, "invalid compression type"); if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) - bkey_fsck_err(c, err, ptr_crc_nonce_mismatch, + bkey_fsck_err(c, ptr_crc_nonce_mismatch, "incorrect nonce"); } - bkey_fsck_err_on(crc_since_last_ptr, c, err, - ptr_crc_redundant, + bkey_fsck_err_on(crc_since_last_ptr, + c, ptr_crc_redundant, "redundant crc entry"); crc_since_last_ptr = true; bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, err, - ptr_crc_uncompressed_size_too_big, + (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: - bkey_fsck_err_on(have_ec, c, err, - ptr_stripe_redundant, + bkey_fsck_err_on(have_ec, + c, ptr_stripe_redundant, "redundant stripe entry"); have_ec = true; break; case BCH_EXTENT_ENTRY_rebalance: { + /* + * this shouldn't be a fsck error, for forward + * compatibility; the rebalance code should just refetch + * the compression opt if it's unknown + */ +#if 0 const struct bch_extent_rebalance *r = &entry->rebalance; if (!bch2_compression_opt_valid(r->compression)) { @@ -1245,28 +1244,29 @@ int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k, opt.type, opt.level); return -BCH_ERR_invalid_bkey; } +#endif break; } } } - bkey_fsck_err_on(!nr_ptrs, c, err, - extent_ptrs_no_ptrs, + bkey_fsck_err_on(!nr_ptrs, + c, extent_ptrs_no_ptrs, "no ptrs"); - bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err, - extent_ptrs_too_many_ptrs, + bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, + c, extent_ptrs_too_many_ptrs, "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX); - bkey_fsck_err_on(have_written && have_unwritten, c, err, - extent_ptrs_written_and_unwritten, + bkey_fsck_err_on(have_written && have_unwritten, + c, extent_ptrs_written_and_unwritten, "extent with unwritten and written ptrs"); - bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err, - extent_ptrs_unwritten, + bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, + c, extent_ptrs_unwritten, "has unwritten ptrs"); - bkey_fsck_err_on(crc_since_last_ptr, c, err, - extent_ptrs_redundant_crc, + bkey_fsck_err_on(crc_since_last_ptr, + c, extent_ptrs_redundant_crc, "redundant crc entry"); - bkey_fsck_err_on(have_ec, c, err, - extent_ptrs_redundant_stripe, + bkey_fsck_err_on(have_ec, + c, extent_ptrs_redundant_stripe, "redundant stripe entry"); fsck_err: return ret; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index facdb8a86eec..1a6ddee48041 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -409,26 +409,26 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void 
bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); #define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_invalid, \ + .key_validate = bch2_btree_ptr_validate, \ .val_to_text = bch2_btree_ptr_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_extent, \ }) #define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_btree_ptr_v2_invalid, \ + .key_validate = bch2_btree_ptr_v2_validate, \ .val_to_text = bch2_btree_ptr_v2_to_text, \ .swab = bch2_ptr_swab, \ .compat = bch2_btree_ptr_v2_compat, \ @@ -441,7 +441,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_extent ((struct bkey_ops) { \ - .key_invalid = bch2_bkey_ptrs_invalid, \ + .key_validate = bch2_bkey_ptrs_validate, \ .val_to_text = bch2_bkey_ptrs_to_text, \ .swab = bch2_ptr_swab, \ .key_normalize = bch2_extent_normalize, \ @@ -451,13 +451,13 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ -int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); #define bch2_bkey_ops_reservation ((struct bkey_ops) { \ - .key_invalid = bch2_reservation_invalid, \ + .key_validate = bch2_reservation_validate, \ .val_to_text = bch2_reservation_to_text, \ .key_merge = bch2_reservation_merge, \ .trigger = bch2_trigger_reservation, \ @@ -683,8 +683,8 @@ bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_ptr_swab(struct bkey_s); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 1e20020eadd1..2be6be33afa3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -434,100 +434,98 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) return &inode_p->inode.k_i; } -static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) +static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bch_inode_unpacked unpacked; int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); - bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err, - inode_pos_blockdev_range, + bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, + c, inode_pos_blockdev_range, "fs inode in blockdev range"); - bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err, - inode_unpack_error, + bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), + c, inode_unpack_error, "invalid variable length fields"); - bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err, - inode_checksum_type_invalid, + 
bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, + c, inode_checksum_type_invalid, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); bkey_fsck_err_on(unpacked.bi_compression && - !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err, - inode_compression_type_invalid, + !bch2_compression_opt_valid(unpacked.bi_compression - 1), + c, inode_compression_type_invalid, "invalid compression opt %u", unpacked.bi_compression - 1); bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) && - unpacked.bi_nlink != 0, c, err, - inode_unlinked_but_nlink_nonzero, + unpacked.bi_nlink != 0, + c, inode_unlinked_but_nlink_nonzero, "flagged as unlinked but bi_nlink != 0"); - bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err, - inode_subvol_root_but_not_dir, + bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), + c, inode_subvol_root_but_not_dir, "subvolume root but not a directory"); fsck_err: return ret; } -int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; - bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; - bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } -int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || - INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err, - inode_v3_fields_start_bad, + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), + c, inode_v3_fields_start_bad, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err, - inode_str_hash_invalid, + bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, + c, inode_str_hash_invalid, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_invalid(c, k, err); + ret = __bch2_inode_validate(c, k, flags); fsck_err: return ret; } @@ -625,14 +623,13 @@ int bch2_trigger_inode(struct btree_trans *trans, return 0; } -int bch2_inode_generation_invalid(struct 
bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode, c, err, - inode_pos_inode_nonzero, + bkey_fsck_err_on(k.k->p.inode, + c, inode_pos_inode_nonzero, "nonzero k.p.inode"); fsck_err: return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index da0e4a745099..f1fcb4c58039 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -9,12 +9,12 @@ enum bch_validate_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); +int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, @@ -22,21 +22,21 @@ int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_inode ((struct bkey_ops) { \ - .key_invalid = bch2_inode_invalid, \ + .key_validate = bch2_inode_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v2_invalid, \ + .key_validate = bch2_inode_v2_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ - .key_invalid = bch2_inode_v3_invalid, \ + .key_validate = bch2_inode_v3_validate, \ .val_to_text = bch2_inode_to_text, \ .trigger = bch2_trigger_inode, \ .min_val_size = 48, \ @@ -49,12 +49,12 @@ static inline bool bkey_is_inode(const struct bkey *k) k->type == KEY_TYPE_inode_v3; } -int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ - .key_invalid = bch2_inode_generation_invalid, \ + .key_validate = bch2_inode_generation_validate, \ .val_to_text = bch2_inode_generation_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7a833a3f1c63..7664b68e6a15 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -332,7 +332,6 @@ static int journal_validate_key(struct bch_fs *c, { int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); - struct printbuf buf = PRINTBUF; int ret = 0; if (journal_entry_err_on(!k->k.u64s, @@ -368,34 +367,21 @@ static int journal_validate_key(struct bch_fs *c, bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); - if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf)) { - printbuf_reset(&buf); - journal_entry_err_msg(&buf, version, jset, entry); 
- prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); - prt_newline(&buf); - bch2_bkey_invalid(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write, &buf); - - mustfix_fsck_err(c, journal_entry_bkey_invalid, - "%s", buf.buf); - + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write); + if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); - - printbuf_exit(&buf); return FSCK_DELETED_KEY; } + if (ret) + goto fsck_err; if (write) bch2_bkey_compat(level, btree_id, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: - printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 83b1586cb371..96f2f4f8c397 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -10,14 +10,13 @@ #include "recovery.h" /* KEY_TYPE_lru is obsolete: */ -int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err, - lru_entry_at_time_0, + bkey_fsck_err_on(!lru_pos_time(k.k->p), + c, lru_entry_at_time_0, "lru entry at time=0"); fsck_err: return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index 5bd8974a7f11..e6a7d8241bb8 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,14 +33,13 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); #define bch2_bkey_ops_lru ((struct bkey_ops) { \ - .key_invalid = bch2_lru_invalid, \ + .key_validate = bch2_lru_validate, \ .val_to_text = bch2_lru_to_text, \ .min_val_size = 8, \ }) diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index a0cca8b70e0a..c32a05e252e2 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -59,13 +59,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { .to_text = bch2_sb_quota_to_text, }; -int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; - bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err, - quota_type_invalid, + bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, + c, quota_type_invalid, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); fsck_err: diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 02d37a332218..a62abcc5332a 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -8,12 +8,11 @@ enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ - .key_invalid = bch2_quota_invalid, \ + .key_validate = bch2_quota_validate, \ .val_to_text = 
bch2_quota_to_text, \ .min_val_size = 32, \ }) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 5f92715e1525..e59c0abb4772 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -29,15 +29,14 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ -int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), - c, err, reflink_p_front_pad_bad, + c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); fsck_err: @@ -256,11 +255,10 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, /* indirect extents */ -int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { - return bch2_bkey_ptrs_invalid(c, k, flags, err); + return bch2_bkey_ptrs_validate(c, k, flags); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -311,9 +309,8 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, /* indirect inline data */ -int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index e894f3a2c67a..51afe11d8ed6 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -4,41 +4,37 @@ enum bch_validate_flags; -int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_p_invalid, \ + .key_validate = bch2_reflink_p_validate, \ .val_to_text = bch2_reflink_p_to_text, \ .key_merge = bch2_reflink_p_merge, \ .trigger = bch2_trigger_reflink_p, \ .min_val_size = 16, \ }) -int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); -void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, - struct bkey_s_c); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ - .key_invalid = bch2_reflink_v_invalid, \ + .key_validate = bch2_reflink_v_validate, \ .val_to_text = bch2_reflink_v_to_text, \ .swab = bch2_ptr_swab, \ .trigger = bch2_trigger_reflink_v, \ .min_val_size = 8, \ }) -int 
bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, @@ -47,7 +43,7 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ - .key_invalid = bch2_indirect_inline_data_invalid, \ + .key_validate = bch2_indirect_inline_data_validate, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trigger = bch2_trigger_indirect_inline_data, \ .min_val_size = 8, \ diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 96744b1a76f5..8b18a9b483a4 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -31,15 +31,14 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(t.v->root_snapshot)); } -int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_tree_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_tree_pos_bad, "bad pos"); fsck_err: return ret; @@ -225,55 +224,54 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, le32_to_cpu(s.v->skip[2])); } -int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_snapshot s; u32 i, id; int ret = 0; bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || - bkey_lt(k.k->p, POS(0, 1)), c, err, - snapshot_pos_bad, + bkey_lt(k.k->p, POS(0, 1)), + c, snapshot_pos_bad, "bad pos"); s = bkey_s_c_to_snapshot(k); id = le32_to_cpu(s.v->parent); - bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, - snapshot_parent_bad, + bkey_fsck_err_on(id && id <= k.k->p.offset, + c, snapshot_parent_bad, "bad parent node (%u <= %llu)", id, k.k->p.offset); - bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, - snapshot_children_not_normalized, + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), + c, snapshot_children_not_normalized, "children not normalized"); - bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, - snapshot_child_duplicate, + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], + c, snapshot_child_duplicate, "duplicate child nodes"); for (i = 0; i < 2; i++) { id = le32_to_cpu(s.v->children[i]); - bkey_fsck_err_on(id >= k.k->p.offset, c, err, - snapshot_child_bad, + bkey_fsck_err_on(id >= k.k->p.offset, + c, snapshot_child_bad, "bad child node (%u >= %llu)", id, k.k->p.offset); } if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || - le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, - snapshot_skiplist_not_normalized, + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), + c, snapshot_skiplist_not_normalized, "skiplist not normalized"); for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = 
le32_to_cpu(s.v->skip[i]); - bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, - snapshot_skiplist_bad, + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), + c, snapshot_skiplist_bad, "bad skiplist node %u", id); } } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 31b0ee03e962..eb5ef64221d6 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -5,11 +5,11 @@ enum bch_validate_flags; void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, + enum bch_validate_flags); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_tree_invalid, \ + .key_validate = bch2_snapshot_tree_validate, \ .val_to_text = bch2_snapshot_tree_to_text, \ .min_val_size = 8, \ }) @@ -19,14 +19,13 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ - .key_invalid = bch2_snapshot_invalid, \ + .key_validate = bch2_snapshot_validate, \ .val_to_text = bch2_snapshot_to_text, \ .trigger = bch2_mark_snapshot, \ .min_val_size = 24, \ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index f56720b55862..dbe834cb349f 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,23 +207,23 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ -int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, struct printbuf *err) +int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) || - bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err, - subvol_pos_bad, + bkey_gt(k.k->p, SUBVOL_POS_MAX), + c, subvol_pos_bad, "invalid pos"); - bkey_fsck_err_on(!subvol.v->snapshot, c, err, - subvol_snapshot_bad, + bkey_fsck_err_on(!subvol.v->snapshot, + c, subvol_snapshot_bad, "invalid snapshot"); - bkey_fsck_err_on(!subvol.v->inode, c, err, - subvol_inode_bad, + bkey_fsck_err_on(!subvol.v->inode, + c, subvol_inode_bad, "invalid inode"); fsck_err: return ret; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index afa5e871efb2..a8299ba2cab2 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -10,15 +10,14 @@ enum bch_validate_flags; int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum 
btree_iter_update_trigger_flags); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ - .key_invalid = bch2_subvolume_invalid, \ + .key_validate = bch2_subvolume_validate, \ .val_to_text = bch2_subvolume_to_text, \ .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index c11bf6dacc2c..f2b4c17a0307 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -70,17 +70,16 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { .cmp_bkey = xattr_cmp_bkey, }; -int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, - struct printbuf *err) +int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len)); int ret = 0; - bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err, - xattr_val_size_too_small, + bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, + c, xattr_val_size_too_small, "value too small (%zu < %u)", bkey_val_u64s(k.k), val_u64s); @@ -88,17 +87,17 @@ int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k, val_u64s = xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4); - bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err, - xattr_val_size_too_big, + bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, + c, xattr_val_size_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), val_u64s); - bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err, - xattr_invalid_type, + bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), + c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err, - xattr_name_invalid_chars, + bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: return ret; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 1574b9eb4c85..c188a5ad64ce 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,12 +6,11 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, struct printbuf *); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ - .key_invalid = bch2_xattr_invalid, \ + .key_validate = bch2_xattr_validate, \ .val_to_text = bch2_xattr_to_text, \ .min_val_size = 8, \ }) -- cgit From 5132b99bb62664c02bd6c0dd62ad3fedc75294d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:35:10 -0400 Subject: bcachefs: Kill __bch2_accounting_mem_mod() The next patch will be adding a disk accounting counter type which is not kept in the in-memory eytzinger tree. As prep, fold __bch2_accounting_mem_mod() into bch2_accounting_mem_mod_locked() so that we can check for that counter type and bail out without calling bpos_to_disk_accounting_pos() twice. 
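[ Editorial sketch, not part of the commit: the diff below is split across several hunks, so here is a consolidated view of the folded helper's shape; the fs_usage_delta switch body is elided. ]

	static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
							 struct bkey_s_c_accounting a,
							 bool gc, bool read)
	{
		struct bch_fs *c = trans->c;
		struct bch_accounting_mem *acc = &c->accounting;
		struct disk_accounting_pos acc_k;
		unsigned idx;

		/* the position is now decoded exactly once, up front */
		bpos_to_disk_accounting_pos(&acc_k, a.k->p);

		/* a counter type with no in-memory entry can bail out here */

		if (!gc && !read) {
			switch (acc_k.type) {
			/* ... fs_usage_delta updates keyed on acc_k.type ... */
			}
		}

		/* former __bch2_accounting_mem_mod() body */
		EBUG_ON(gc && !acc->gc_running);

		while ((idx = eytzinger0_find(acc->k.data, acc->k.nr,
					      sizeof(acc->k.data[0]),
					      accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
			int ret = bch2_accounting_mem_insert(c, a, gc);
			if (ret)
				return ret;
		}

		struct accounting_mem_entry *e = &acc->k.data[idx];

		EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters);

		for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++)
			this_cpu_add(e->v[gc][i], a.v->d[i]);
		return 0;
	}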
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 4 +-- fs/bcachefs/disk_accounting.c | 4 +-- fs/bcachefs/disk_accounting.h | 54 ++++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 1a1e9d2036da..a0101d9c5d83 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -712,7 +712,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, a->k.version = journal_pos_to_bversion(&trans->journal_res, (u64 *) entry - (u64 *) trans->journal_entries); BUG_ON(bversion_zero(a->k.version)); - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false); + ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); if (ret) goto revert_fs_usage; } @@ -798,7 +798,7 @@ revert_fs_usage: struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, false); + bch2_accounting_mem_mod_locked(trans, a.c, false, false); bch2_accounting_neg(a); } percpu_up_read(&c->mark_lock); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index f059cbffdf23..e57b40623cd9 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -566,7 +566,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; accounting_key_init(&k_i.k, &acc_k, src_v, nr); - bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false); + bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -595,7 +595,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) return 0; percpu_down_read(&c->mark_lock); - int ret = __bch2_accounting_mem_mod(c, bkey_s_c_to_accounting(k), false); + int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); percpu_up_read(&c->mark_lock); if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) && diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index b92f8c2e3054..653090667aaa 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -106,41 +106,17 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r) int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); void bch2_accounting_mem_gc(struct bch_fs *); -static inline int __bch2_accounting_mem_mod(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) -{ - struct bch_accounting_mem *acc = &c->accounting; - unsigned idx; - - EBUG_ON(gc && !acc->gc_running); - - while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, gc); - if (ret) - return ret; - } - - struct accounting_mem_entry *e = &acc->k.data[idx]; - - EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); - - for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) - this_cpu_add(e->v[gc][i], a.v->d[i]); - return 0; -} - /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path */ -static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) +static inline int 
bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) { struct bch_fs *c = trans->c; + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, a.k->p); - if (!gc) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, a.k->p); - + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; @@ -161,13 +137,31 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru } } - return __bch2_accounting_mem_mod(c, a, gc); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx; + + EBUG_ON(gc && !acc->gc_running); + + while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { + int ret = bch2_accounting_mem_insert(c, a, gc); + if (ret) + return ret; + } + + struct accounting_mem_entry *e = &acc->k.data[idx]; + + EBUG_ON(bch2_accounting_counters(a.k) != e->nr_counters); + + for (unsigned i = 0; i < bch2_accounting_counters(a.k); i++) + this_cpu_add(e->v[gc][i], a.v->d[i]); + return 0; } static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); percpu_up_read(&trans->c->mark_lock); return ret; } -- cgit From 58474f76a770bcc79d4b2d7232e4d6650e732b50 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 12 Aug 2024 02:27:36 -0400 Subject: bcachefs: bcachefs_metadata_version_disk_accounting_inum This adds another disk accounting counter to track usage per inode number (any snapshot ID). This will be used for a couple things: - It'll give us a way to tell the user how much space a given file is consuming in all snapshots; i.e. how much extra space it's consuming due to snapshot versioning. - It counts number of extents and total size of extents (both in btree keyspace sectors and actual disk usage), meaning it gives us average extent size: that is, it'll let us cheaply find fragmented files that should be defragmented.
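To make the fragmentation use case concrete, here is a sketch of how the three per-inum counters could be consumed (hypothetical helper, not part of the patch; the v[] layout matches the extent trigger below: extent count, btree keyspace sectors, on-disk sectors):

	#include <linux/math64.h>

	/* average on-disk extent size in sectors; 0 for a file with no extents */
	static inline s64 avg_extent_sectors(const s64 v[3])
	{
		return v[0] ? div64_s64(v[2], v[0]) : 0;
	}

A large file whose average extent size sits far below what the workload normally produces is a cheap-to-find defragmentation candidate.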
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/buckets.c | 14 ++++++++++++++ fs/bcachefs/disk_accounting.c | 3 +++ fs/bcachefs/disk_accounting.h | 3 +++ fs/bcachefs/disk_accounting_format.h | 8 +++++++- fs/bcachefs/sb-downgrade.c | 5 ++++- 6 files changed, 33 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b25f86356728..c75f2e0f32bb 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -676,7 +676,8 @@ struct bch_sb_field_ext { x(mi_btree_bitmap, BCH_VERSION(1, 7)) \ x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ - x(disk_accounting_v3, BCH_VERSION(1, 10)) + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 9f7004e941ce..b69ef4b3de6e 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -810,6 +810,20 @@ static int __trigger_extent(struct btree_trans *trans, ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); if (ret) return ret; + } else { + bool insert = !(flags & BTREE_TRIGGER_overwrite); + struct disk_accounting_pos acc_inum_key = { + .type = BCH_DISK_ACCOUNTING_inum, + .inum.inum = k.k->p.inode, + }; + s64 v[3] = { + insert ? 1 : -1, + insert ? k.k->size : -((s64) k.k->size), + replicas_sectors, + }; + ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + if (ret) + return ret; } if (bch2_bkey_rebalance_opts(k)) { diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index e57b40623cd9..e972e2bca546 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -768,6 +768,9 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) continue; + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + continue; + bch2_accounting_mem_read(c, k.k->p, v, nr); if (memcmp(a.v->d, v, nr * sizeof(u64))) { diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 653090667aaa..f29fd0dd9581 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -116,6 +116,9 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, a.k->p); + if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + return 0; + if (!gc && !read) { switch (acc_k.type) { case BCH_DISK_ACCOUNTING_persistent_reserved: diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index a93cf26ff4a9..7b6e6c97e6aa 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -103,7 +103,8 @@ static inline bool data_type_is_hidden(enum bch_data_type type) x(compression, 4) \ x(snapshot, 5) \ x(btree, 6) \ - x(rebalance_work, 7) + x(rebalance_work, 7) \ + x(inum, 8) enum disk_accounting_type { #define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, @@ -136,6 +137,10 @@ struct bch_acct_btree { __u32 id; } __packed; +struct bch_acct_inum { + __u64 inum; +} __packed; + struct bch_acct_rebalance_work { }; @@ -152,6 +157,7 @@ struct disk_accounting_pos { struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; struct bch_acct_rebalance_work rebalance_work; + struct bch_acct_inum inum; } __packed; } __packed; struct bpos _pad; diff --git 
a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9f82d497d9e0..650a1f77ca40 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -72,7 +72,10 @@ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ - BCH_FSCK_ERR_accounting_key_junk_at_end) + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ -- cgit From 73c34b0b85d46bf9c2c0b367aeaffa1e2481b136 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 29 Jul 2024 13:44:33 -0700 Subject: xfs: attr forks require attr, not attr2 It turns out that I misunderstood the difference between the attr and attr2 feature bits. "attr" means that at some point an attr fork was created somewhere in the filesystem. "attr2" means that inodes have variable-sized forks, but says nothing about whether or not there actually /are/ attr forks in the system. If we have an attr fork, we only need to check that attr is set. Fixes: 99d9d8d05da26 ("xfs: scrub inode block mappings") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/bmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 24a15bf784f1..5ab2ac53c920 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -938,7 +938,13 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) + /* + * "attr" means that an attr fork was created at some point in + * the life of this filesystem. "attr2" means that inodes have + * variable-sized data/attr fork areas. Hence we only check + * attr here. + */ + if (!xfs_has_attr(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: -- cgit From 04d6dbb55301a29aeb9a197e6b0012cdc265f1e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:34 -0700 Subject: xfs: revert AIL TASK_KILLABLE threshold In commit 9adf40249e6c, we changed the behavior of the AIL thread to set its own task state to KILLABLE whenever the timeout value is nonzero. Unfortunately, this missed the fact that xfsaild_push will return 50ms (aka a longish sleep) when we reach the push target or the AIL becomes empty, so xfsaild goes to sleep for a long period of time in uninterruptible D state. This results in artificially high load averages because KILLABLE processes are UNINTERRUPTIBLE, which contributes to load average even though the AIL is asleep waiting for someone to interrupt it. It's not blocked on IOs or anything, but people scrape ps for processes that look like they're stuck in D state, so restore the previous threshold. Fixes: 9adf40249e6c ("xfs: AIL doesn't need manual pushing") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_trans_ail.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fafcc9f3dbe..8ede9d099d1f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -644,7 +644,12 @@ xfsaild( set_freezable(); while (1) { - if (tout) + /* + * Long waits of 50ms or more occur when we've run out of items + * to push, so we only want uninterruptible state if we're + * actually blocked on something.
+ */ + if (tout && tout <= 20) set_current_state(TASK_KILLABLE|TASK_FREEZABLE); else set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); -- cgit From 8d16762047c627073955b7ed171a36addaf7b1ff Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 4 Aug 2024 14:39:57 -0700 Subject: xfs: conditionally allow FS_XFLAG_REALTIME changes if S_DAX is set If a file has the S_DAX flag (aka fsdax access mode) set, we cannot allow users to change the realtime flag unless the datadev and rtdev both support fsdax access modes. Even if there are no extents allocated to the file, the setattr thread could be racing with another thread that has already started down the write code paths. Fixes: ba23cba9b3bdc ("fs: allow per-device dax status checking for filesystems") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 4e933db75b12..6b13666d4e96 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -483,6 +483,17 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; + + /* + * If S_DAX is enabled on this file, we can only switch the + * device if both support fsdax. We can't update S_DAX because + * there might be other threads walking down the access paths. + */ + if (IS_DAX(VFS_I(ip)) && + (mp->m_ddev_targp->bt_daxdev == NULL || + (mp->m_rtdev_targp && + mp->m_rtdev_targp->bt_daxdev == NULL))) + return -EINVAL; } if (rtflag) { -- cgit From c0247d289e73e18f6ddb0895de30c09770fbed95 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 13 Aug 2024 12:53:15 +0200 Subject: btrfs: send: annotate struct name_cache_entry with __counted_by() Add the __counted_by compiler attribute to the flexible array member name to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7fc692fc76e1..619fa0b8b3f6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -347,7 +347,7 @@ struct name_cache_entry { int ret; int need_later_update; int name_len; - char name[]; + char name[] __counted_by(name_len); }; /* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */ -- cgit From 3bc2ac2f8f0b78a13140fc72022771efe0c9b778 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 12 Aug 2024 12:30:52 -0400 Subject: btrfs: update target inode's ctime on unlink Unlink changes the link count on the target inode. POSIX mandates that the ctime must also change when this occurs. According to https://pubs.opengroup.org/onlinepubs/9699919799/functions/unlink.html: "Upon successful completion, unlink() shall mark for update the last data modification and last file status change timestamps of the parent directory. Also, if the file's link count is not 0, the last file status change timestamp of the file shall be marked for update." 
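The rule is easy to observe from user space. Here is a stand-alone sketch (not part of the patch; the file names and the one-second sleep are only there to make the ctime step visible at coarse timestamp granularity):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat before, after;

	if (creat("a", 0600) < 0 || link("a", "b") < 0)
		return 1;
	stat("a", &before);
	sleep(1);			/* make a ctime step observable */
	unlink("b");			/* link count drops 2 -> 1 */
	stat("a", &after);
	printf("ctime of surviving link: %s\n",
	       after.st_ctime > before.st_ctime ?
	       "updated (as POSIX requires)" : "unchanged (the bug)");
	unlink("a");
	return 0;
}

Before the one-line fix below, btrfs bumped the inode version but left the surviving link's ctime stale.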
Signed-off-by: Jeff Layton Reviewed-by: David Sterba [ add link to the opengroup docs ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 333b0e8587a2..b1b6564ab68f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4195,6 +4195,7 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); -- cgit From 008e2512dc5696ab2dc5bf264e98a9fe9ceb830e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 11 Aug 2024 15:00:22 +0930 Subject: btrfs: tree-checker: add dev extent item checks [REPORT] There is a corruption report that btrfs refused to mount a fs that has overlapping dev extents: BTRFS error (device sdc): dev extent devid 4 physical offset 14263979671552 overlap with previous dev extent end 14263980982272 BTRFS error (device sdc): failed to verify dev extents against chunks: -117 BTRFS error (device sdc): open_ctree failed [CAUSE] The direct cause is very obvious: there is a bad dev extent item with an incorrect length. With btrfs check reporting two overlapping extents, the second one shows some clue on the cause: ERROR: dev extent devid 4 offset 14263979671552 len 6488064 overlap with previous dev extent end 14263980982272 ERROR: dev extent devid 13 offset 2257707008000 len 6488064 overlap with previous dev extent end 2257707270144 ERROR: errors found in extent allocation tree or chunk allocation The second one looks like a bitflip happened during new chunk allocation: hex(2257707008000) = 0x20da9d30000 hex(2257707270144) = 0x20da9d70000 diff = 0x00000040000 So it looks like a bitflip happened during new dev extent allocation, resulting in the second overlap. Currently we only do the dev-extent verification at mount time, but if the corruption is caused by memory bitflip, we really want to catch it before writing the corruption to the storage. Furthermore the dev extent items have the following key definition: (<devid> DEV_EXTENT <physical offset>) Thus we can not just rely on the generic key order check to make sure there is no overlapping. [ENHANCEMENT] Introduce dedicated dev extent checks, including: - Fixed member checks * chunk_tree should always be BTRFS_CHUNK_TREE_OBJECTID (3) * chunk_objectid should always be BTRFS_FIRST_CHUNK_TREE_OBJECTID (256) - Alignment checks * chunk_offset should be aligned to sectorsize * length should be aligned to sectorsize * key.offset should be aligned to sectorsize - Overlap checks If the previous key is also a dev-extent item, with the same device id, make sure we do not overlap with the previous dev extent. 
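The single-bitflip deduction above can be checked mechanically: XORing the bad offset with the expected one leaves exactly one bit set. A stand-alone sketch using the values copied from the report:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t found = 0x20da9d30000ULL;	/* offset of the bad item */
	uint64_t expected = 0x20da9d70000ULL;	/* end of the previous extent */
	uint64_t diff = found ^ expected;

	/* a nonzero power-of-two XOR means exactly one flipped bit */
	printf("xor = %#llx, single bitflip: %s\n",
	       (unsigned long long)diff,
	       (diff && !(diff & (diff - 1))) ? "yes" : "no");
	return 0;
}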
Reported: Stefan N Link: https://lore.kernel.org/linux-btrfs/CA+W5K0rSO3koYTo=nzxxTm1-Pdu1HYgVxEpgJ=aGc7d=E8mGEg@mail.gmail.com/ CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 6f1e2f2215d9..634d69964fe4 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1764,6 +1764,72 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf, return 0; } +static int check_dev_extent_item(const struct extent_buffer *leaf, + const struct btrfs_key *key, + int slot, + struct btrfs_key *prev_key) +{ + struct btrfs_dev_extent *de; + const u32 sectorsize = leaf->fs_info->sectorsize; + + de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + /* Basic fixed member checks. */ + if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != + BTRFS_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk tree id, has %llu expect %llu", + btrfs_dev_extent_chunk_tree(leaf, de), + BTRFS_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != + BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { + generic_err(leaf, slot, + "invalid dev extent chunk objectid, has %llu expect %llu", + btrfs_dev_extent_chunk_objectid(leaf, de), + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + return -EUCLEAN; + } + /* Alignment check. */ + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent key.offset, has %llu not aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent chunk offset, has %llu not aligned to %u", + btrfs_dev_extent_chunk_objectid(leaf, de), + sectorsize); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), + sectorsize))) { + generic_err(leaf, slot, + "invalid dev extent length, has %llu not aligned to %u", + btrfs_dev_extent_length(leaf, de), sectorsize); + return -EUCLEAN; + } + /* Overlap check with previous dev extent. */ + if (slot && prev_key->objectid == key->objectid && + prev_key->type == key->type) { + struct btrfs_dev_extent *prev_de; + u64 prev_len; + + prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); + prev_len = btrfs_dev_extent_length(leaf, prev_de); + if (unlikely(prev_key->offset + prev_len > key->offset)) { + generic_err(leaf, slot, + "dev extent overlap, prev offset %llu len %llu current offset %llu", + prev_key->objectid, prev_len, key->offset); + return -EUCLEAN; + } + } + return 0; +} + /* * Common point to switch the item-specific validation. 
*/ @@ -1800,6 +1866,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); break; + case BTRFS_DEV_EXTENT_KEY: + ret = check_dev_extent_item(leaf, key, slot, prev_key); + break; case BTRFS_INODE_ITEM_KEY: ret = check_inode_item(leaf, key, slot); break; -- cgit From e30729d4bd4001881be4d1ad4332a5d4985398f8 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 1 Aug 2024 16:47:52 +0900 Subject: btrfs: zoned: properly take lock to read/update block group's zoned variables __btrfs_add_free_space_zoned() references and modifies bg's alloc_offset, ro, and zone_unusable, but without taking the lock. It is mostly safe because they monotonically increase (at least for now) and this function is mostly called by a transaction commit, which is serialized by itself. Still, taking the lock is a safer and correct option and I'm going to add a change to reset zone_unusable while a block group is still alive. So, add locking around the operations. Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f5996a43db24..eaa1dbd31352 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2697,15 +2697,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; int bg_reclaim_threshold = 0; - bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + bool initial; u64 reclaimable_unusable; - WARN_ON(!initial && offset + size > block_group->zone_capacity); + spin_lock(&block_group->lock); + initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); + WARN_ON(!initial && offset + size > block_group->zone_capacity); if (!initial) bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); - spin_lock(&ctl->tree_lock); if (!used) to_free = size; else if (initial) @@ -2718,7 +2719,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, to_free = offset + size - block_group->alloc_offset; to_unusable = size - to_free; + spin_lock(&ctl->tree_lock); ctl->free_space += to_free; + spin_unlock(&ctl->tree_lock); /* * If the block group is read-only, we should account freed space into * bytes_readonly. 
@@ -2727,11 +2730,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, block_group->zone_unusable += to_unusable; WARN_ON(block_group->zone_unusable > block_group->length); } - spin_unlock(&ctl->tree_lock); if (!used) { - spin_lock(&block_group->lock); block_group->alloc_offset -= size; - spin_unlock(&block_group->lock); } reclaimable_unusable = block_group->zone_unusable - @@ -2745,6 +2745,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, btrfs_mark_bg_to_reclaim(block_group); } + spin_unlock(&block_group->lock); + return 0; } -- cgit From 74c2ab6d653b4c2354df65a7f7f2df1925a40a51 Mon Sep 17 00:00:00 2001 From: Su Hui Date: Thu, 8 Aug 2024 20:23:32 +0800 Subject: smb/client: avoid possible NULL dereference in cifs_free_subrequest() Clang static checker (scan-build) warning: cifsglob.h:line 890, column 3 Access to field 'ops' results in a dereference of a null pointer. Commit 519be989717c ("cifs: Add a tracepoint to track credits involved in R/W requests") adds a check for 'rdata->server', and lets clang throw this warning about a NULL dereference. When 'rdata->credits.value != 0 && rdata->server == NULL' happens, add_credits_and_wake_if() will call rdata->server->ops->add_credits(). This will cause a NULL dereference. Add a check for 'rdata->server' to avoid the NULL dereference. Cc: stable@vger.kernel.org Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Reviewed-by: David Howells Signed-off-by: Su Hui Signed-off-by: Steve French --- fs/smb/client/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index b2405dd4d4d4..45459af5044d 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -315,7 +315,7 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) #endif } - if (rdata->credits.value != 0) + if (rdata->credits.value != 0) { trace_smb3_rw_credits(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->credits.value, @@ -323,8 +323,12 @@ static void cifs_free_subrequest(struct netfs_io_subrequest *subreq) rdata->server ? rdata->server->in_flight : 0, -rdata->credits.value, cifs_trace_rw_credits_free_subreq); + if (rdata->server) + add_credits_and_wake_if(rdata->server, &rdata->credits, 0); + else + rdata->credits.value = 0; + } - add_credits_and_wake_if(rdata->server, &rdata->credits, 0); if (rdata->have_xid) free_xid(rdata->xid); } -- cgit From 836bb3268db405cf9021496ac4dbc26d3e4758fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Aug 2024 14:03:43 -0500 Subject: smb3: fix lock breakage for cached writes Mandatory locking is enforced for cached writes, which violates default posix semantics, and it is also enforced inconsistently. This apparently breaks recent versions of libreoffice, but can also be demonstrated by opening a file twice from the same client, locking it from handle one and writing to it from handle two (which fails, returning EACCES). Since there was already a mount option "forcemandatorylock" (which defaults to off), with this change we only break posix semantics on a write to a locked range when the user intentionally specifies "forcemandatorylock" at mount time; only then will the write fail. 
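For reference, this is the default posix behaviour being restored, shown with plain advisory locks in a stand-alone user-space sketch (the file name is arbitrary; nothing here is CIFS-specific):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd1 = open("f", O_RDWR | O_CREAT, 0600);
	int fd2 = open("f", O_RDWR);
	struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };

	if (fd1 < 0 || fd2 < 0 || fcntl(fd1, F_SETLK, &fl) < 0)
		return 1;
	/* posix locks are advisory: they never block plain write(2) */
	ssize_t n = write(fd2, "x", 1);

	printf("write through second handle: %zd (%s)\n",
	       n, n == 1 ? "allowed, default posix semantics" : "blocked");
	unlink("f");
	return 0;
}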
Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") Cc: stable@vger.kernel.org Cc: Pavel Shilovsky Reported-by: abartlet@samba.org Reported-by: Kevin Ottens Reviewed-by: David Howells Signed-off-by: Steve French --- fs/smb/client/file.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 45459af5044d..06a0667f8ff2 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2753,6 +2753,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); ssize_t rc; rc = netfs_start_io_write(inode); @@ -2769,12 +2770,16 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) if (rc <= 0) goto out; - if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && + (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, 0, - NULL, CIFS_WRITE_OP)) - rc = netfs_buffered_write_iter_locked(iocb, from, NULL); - else + NULL, CIFS_WRITE_OP))) { rc = -EACCES; + goto out; + } + + rc = netfs_buffered_write_iter_locked(iocb, from, NULL); + out: up_read(&cinode->lock_sem); netfs_end_io_write(inode); -- cgit From 5b4f3af39b6588e8de4444d8e1ccf759b40f9414 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 8 Aug 2024 16:04:04 -0600 Subject: smb: smb2pdu.h: Use static_assert() to check struct sizes Commit 9f9bef9bc5c6 ("smb: smb2pdu.h: Avoid -Wflex-array-member-not-at-end warnings") introduced tagged `struct create_context_hdr`. We want to ensure that when new members need to be added to the flexible structure, they are always included within this tagged struct. So, we use `static_assert()` to ensure that the memory layout for both the flexible structure and the tagged struct is the same after any changes. Acked-by: Namjae Jeon Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c3ee42188d25..c769f9dbc0b4 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -1216,6 +1216,8 @@ struct create_context { ); __u8 Buffer[]; } __packed; +static_assert(offsetof(struct create_context, Buffer) == sizeof(struct create_context_hdr), + "struct member likely outside of __struct_group()"); struct smb2_create_req { struct smb2_hdr hdr; -- cgit From c2f6e16a6771eaefba6bb35f6803fe7217822d41 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 15 Aug 2024 13:02:55 -0400 Subject: bcachefs: Increase size of cuckoo hash table on too many rehashes Also, improve the calculation of the new table size, so that it can shrink when needed. 
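The new sizing rule in the diff below derives the table size purely from the live element count, which is what allows shrinking. A stand-alone sketch with user-space stand-ins for the kernel's ilog2() and roundup_pow_of_two():

#include <stdio.h>
#include <stdint.h>

/* user-space stand-ins for the kernel helpers used by the patch */
static unsigned int ilog2_u64(uint64_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static uint64_t roundup_pow_of_two_u64(uint64_t v)
{
	uint64_t r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	/* sizing depends only on the live element count, so a table that
	 * grew for 100000 entries can come back down for 50 */
	uint64_t nr_elements[] = { 10, 1000, 100000, 50 };

	for (int i = 0; i < 4; i++)
		printf("nr_elements=%6llu -> new_bits=%u\n",
		       (unsigned long long)nr_elements[i],
		       ilog2_u64(roundup_pow_of_two_u64(nr_elements[i] * 3)));
	return 0;
}

The old rule, t->bits + (nr_elements * 3 > size), could only grow by one bit per rehash and never shrink.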
Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index ec1b636ef78d..f70eb2127d32 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -93,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,7 +106,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_bits = t->bits + (nr_elements * 3 > size); + new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { @@ -115,7 +115,14 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, } retry_rehash: + if (nr_rehashes_this_size == 3) { + new_bits++; + nr_rehashes_this_size = 0; + } + nr_rehashes++; + nr_rehashes_this_size++; + bucket_table_init(n, new_bits); tmp = new; -- cgit From 075cabf324c3fd790d6ba39ff9db33a30b954fe2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:31:29 -0400 Subject: bcachefs: Fix forgetting to pass trans to fsck_err() Reported-by: syzbot+e3938cd6d761b78750e6@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index b69ef4b3de6e..ad517ef744e5 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -925,7 +925,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", -- cgit From 9482f3b05332a624508a91c2ab2cf3527328a6a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:41:46 -0400 Subject: bcachefs: avoid overflowing LRU_TIME_BITS for cached data lru Reported-by: syzbot+510b0b28f8e6de64d307@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 260e7fa83d05..fd790b03fbe1 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -150,7 +150,9 @@ static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_typ static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { - return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; + return a.data_type == BCH_DATA_cached + ? 
a.io_time[READ] & LRU_TIME_MAX : 0; } #define DATA_TYPES_MOVABLE \ -- cgit From 99c87fe0f584f8d778a323141504d1ba5c89a4a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2024 12:44:49 -0400 Subject: bcachefs: fix incorrect i_state usage Reported-by: syzbot+95e40eae71609e40d851@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 15fc41e63b6c..94c392abef65 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -193,7 +193,7 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * only insert fully created inodes in the inode hash table. But * discard_new_inode() expects it to be set... */ - inode->v.i_flags |= I_NEW; + inode->v.i_state |= I_NEW; /* * We don't want bch2_evict_inode() to delete the inode on disk, * we just raced and had another inode in cache. Normally new -- cgit From 534f7eff9239c1b0af852fc33f5af2b62c00eddf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Aug 2024 10:40:38 +0930 Subject: btrfs: only enable extent map shrinker for DEBUG builds Although there are several patches improving the extent map shrinker, there are still reports of too frequent shrinker behavior, taking too much CPU for the kswapd process. So let's only enable the extent map shrinker for DEBUG builds for now, until we have a more comprehensive understanding and a better solution. Link: https://lore.kernel.org/linux-btrfs/3df4acd616a07ef4d2dc6bad668701504b412ffc.camel@intelfx.name/ Link: https://lore.kernel.org/linux-btrfs/c30fd6b3-ca7a-4759-8a53-d42878bf84f7@gmail.com/ Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") CC: stable@vger.kernel.org # 6.10+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11044e9e2cb1..98fa0f382480 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2402,7 +2402,13 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro trace_btrfs_extent_map_shrinker_count(fs_info, nr); - return nr; + /* + * Only report the real number for DEBUG builds, as there are reports of + * serious performance degradation caused by too frequent shrinks. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + return nr; + return 0; } static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) -- cgit From 0e49d3ff12501adaafaf6fdb19699f021d1eda1c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 4 May 2024 23:48:58 -0400 Subject: bcachefs: Fix locking in __bch2_trans_mark_dev_sb() We run this in full RW mode now, so we have to guard against the superblock buffer being reallocated. 
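The shape of the fix in the diff below is a snapshot-under-lock pattern: hold sb_lock only long enough to copy the layout by value, then walk the private copy. A reduced, stand-alone sketch (types and names are illustrative, not the bcachefs ones):

#include <pthread.h>
#include <stdio.h>

/* illustrative reduced layout; the real one lives inside the superblock
 * buffer, which may be reallocated while the lock is not held */
struct layout { int nr; long long offset[8]; };

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
static struct layout *sb_layout;

static void mark_dev_sb(void)
{
	struct layout copy;

	pthread_mutex_lock(&sb_lock);
	copy = *sb_layout;		/* snapshot while protected */
	pthread_mutex_unlock(&sb_lock);

	/* this loop cannot be torn by a concurrent reallocation,
	 * because it only reads the stack copy */
	for (int i = 0; i < copy.nr; i++)
		printf("marking superblock at sector %lld\n", copy.offset[i]);
}

int main(void)
{
	struct layout l = { .nr = 2, .offset = { 8, 1024 } };

	sb_layout = &l;
	mark_dev_sb();
	return 0;
}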
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 5 +---- fs/bcachefs/buckets.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 6cbf2aa6a947..eb3002c4eae7 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -741,12 +741,9 @@ fsck_err: static int bch2_mark_superblocks(struct bch_fs *c) { - mutex_lock(&c->sb_lock); gc_pos_set(c, gc_phase(GC_PHASE_sb)); - int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); - mutex_unlock(&c->sb_lock); - return ret; + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc); } static void bch2_gc_free(struct bch_fs *c) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ad517ef744e5..be2bbd248631 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -915,7 +915,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { - struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -1046,13 +1045,18 @@ static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *ca, enum btree_iter_update_trigger_flags flags) { - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + struct bch_fs *c = trans->c; + + mutex_lock(&c->sb_lock); + struct bch_sb_layout layout = ca->disk_sb.sb->layout; + mutex_unlock(&c->sb_lock); + u64 bucket = 0; unsigned i, bucket_sectors = 0; int ret; - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); + for (i = 0; i < layout.nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout.sb_offset[i]); if (offset == BCH_SB_SECTOR) { ret = bch2_trans_mark_metadata_sectors(trans, ca, @@ -1063,7 +1067,7 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c } ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, - offset + (1 << layout->sb_max_size_bits), + offset + (1 << layout.sb_max_size_bits), BCH_DATA_sb, &bucket, &bucket_sectors, flags); if (ret) return ret; -- cgit From 3c0da3d163eb32f1f91891efaade027fa9b245b9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 6 Aug 2024 21:51:42 +0200 Subject: fuse: Initialize beyond-EOF page contents before setting uptodate fuse_notify_store(), unlike fuse_do_readpage(), does not enable page zeroing (because it can be used to change partial page contents). So fuse_notify_store() must be more careful to fully initialize page contents (including parts of the page that are beyond end-of-file) before marking the page uptodate. The current code can leave beyond-EOF page contents uninitialized, which makes these uninitialized page contents visible to userspace via mmap(). This is an information leak, but only affects systems which do not enable init-on-alloc (via CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y or the corresponding kernel command line parameter). 
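The fix below zeroes the uncopied part of the page (via zero_user_segment()) before SetPageUptodate() publishes it. A user-space analogue of the invariant (PAGE_SIZE and the payload here are illustrative):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096	/* illustrative */

/* copy 'this_num' bytes into a page and zero the tail before the page
 * is published, mirroring what zero_user_segment() does in the fix */
static void fill_page(unsigned char *page, const unsigned char *src,
		      size_t this_num)
{
	memcpy(page, src, this_num);
	memset(page + this_num, 0, PAGE_SIZE - this_num);
}

int main(void)
{
	unsigned char page[PAGE_SIZE];
	unsigned char payload[100] = "partial page of data";

	memset(page, 0xaa, sizeof(page));   /* stand-in for stale memory */
	fill_page(page, payload, sizeof(payload));
	printf("first byte past the payload: %#x (must be 0)\n",
	       page[sizeof(payload)]);
	return 0;
}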
Link: https://bugs.chromium.org/p/project-zero/issues/detail?id=2574 Cc: stable@kernel.org Fixes: a1d75f258230 ("fuse: add store request") Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9eb191b5c4de..7146038b2fe7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1618,9 +1618,11 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, this_num = min_t(unsigned, num, PAGE_SIZE - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) + if (!PageUptodate(page) && !err && offset == 0 && + (this_num == PAGE_SIZE || file_size == end)) { + zero_user_segment(page, this_num, PAGE_SIZE); SetPageUptodate(page); + } unlock_page(page); put_page(page); -- cgit From e4be320eeca842a3d7648258ee3673f1755a5a59 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 15 Aug 2024 18:31:36 -0500 Subject: smb3: fix broken cached reads when posix locks Mandatory locking is enforced for cached reads, which violates default posix semantics, and also it is enforced inconsistently. This affected recent versions of libreoffice, and can be demonstrated by opening a file twice from the same client, locking it from handle one and trying to read from it from handle two (which fails, returning EACCES). There is already a mount option "forcemandatorylock" (which defaults to off), so with this change only when the user intentionally specifies "forcemandatorylock" on mount will we break posix semantics on read to a locked range (ie we will only fail in this case, if the user mounts with "forcemandatorylock"). An earlier patch fixed the write path. Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") Cc: stable@vger.kernel.org Cc: Pavel Shilovsky Reviewed-by: David Howells Reported-by: abartlet@samba.org Reported-by: Kevin Ottens Signed-off-by: Steve French --- fs/smb/client/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 1fc66bcf49eb..f9b302cb8233 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -2912,9 +2912,7 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to) if (!CIFS_CACHE_READ(cinode)) return netfs_unbuffered_read_iter(iocb, to); - if (cap_unix(tcon->ses) && - (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && - ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) { if (iocb->ki_flags & IOCB_DIRECT) return netfs_unbuffered_read_iter(iocb, to); return netfs_buffered_read_iter(iocb, to); -- cgit From dfd046d0ced19b6ff5f11ec4ceab0a83de924771 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 15 Aug 2024 08:56:35 +0900 Subject: ksmbd: Use unsafe_memcpy() for ntlm_negotiate rsp buffer is allocated larger than spnego_blob from smb2_allocate_rsp_buf(). 
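A reduced user-space analogue of why the annotation is needed (the struct and sizes are illustrative, not the ksmbd ones): the destination is computed from a named member, so fortified bounds checking assumes the member's size, even though the copy stays inside the larger allocation:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct hdr { unsigned int protocol_id; };	/* illustrative */

int main(void)
{
	/* the allocation is deliberately larger than struct hdr, just as
	 * smb2_allocate_rsp_buf() makes the response buffer larger */
	struct hdr *rsp = malloc(sizeof(*rsp) + 128);
	const char blob[] = "spnego";

	if (!rsp)
		return 1;
	/* destination derived from a member: bounds checkers see only the
	 * member's extent, yet the copy is inside the real allocation */
	memcpy((char *)&rsp->protocol_id + sizeof(*rsp), blob, sizeof(blob));
	printf("blob at offset %zu of a %zu-byte buffer\n",
	       sizeof(*rsp), sizeof(*rsp) + 128);
	free(rsp);
	return 0;
}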
Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 2df1354288e6..3f4c56a10a86 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1370,7 +1370,8 @@ static int ntlm_negotiate(struct ksmbd_work *work, } sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); out: @@ -1453,7 +1454,9 @@ static int ntlm_authenticate(struct ksmbd_work *work, return -ENOMEM; sz = le16_to_cpu(rsp->SecurityBufferOffset); - memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, + spnego_blob_len, + /* alloc is larger than blob, see smb2_allocate_rsp_buf() */); rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); kfree(spnego_blob); } -- cgit From 76e98a158b207771a6c9a0de0a60522a446a3447 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 17 Aug 2024 14:03:49 +0900 Subject: ksmbd: fix race condition between destroy_previous_session() and smb2 operations() If there is a ->PreviousSessionId field in the session setup request, the session of the previous connection should be destroyed. During this, if the smb2 operation requests in the previous session are being processed, a racy issue could happen with ksmbd_destroy_file_table(). This patch sets conn->status to KSMBD_SESS_NEED_RECONNECT to block incoming operations and waits until on-going operations are complete (i.e. idle) before destroying the previous session. 
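A reduced sketch of the block-then-drain pattern the patch applies (pthread stand-ins; the real code uses conn->status, per-connection req_running counters, and wait queues, as shown in the diff below):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t idle = PTHREAD_COND_INITIALIZER;
static int need_reconnect;	/* stand-in for KSMBD_SESS_NEED_RECONNECT */
static int req_running;	/* stand-in for conn->req_running */

static void destroy_previous_session(void)
{
	pthread_mutex_lock(&lk);
	need_reconnect = 1;		/* new operations now bounce */
	while (req_running > 0)		/* drain on-going operations */
		pthread_cond_wait(&idle, &lk);
	printf("file table can be destroyed safely\n");
	need_reconnect = 0;		/* back to negotiate, as in the patch */
	pthread_mutex_unlock(&lk);
}

int main(void)
{
	destroy_previous_session();	/* trivially idle in this demo */
	return 0;
}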
Fixes: c8efcc786146 ("ksmbd: add support for durable handles v1/v2") Cc: stable@vger.kernel.org # v6.6+ Reported-by: zdi-disclosures@trendmicro.com # ZDI-CAN-25040 Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/connection.c | 34 +++++++++++++++++++++++++++++++++- fs/smb/server/connection.h | 3 ++- fs/smb/server/mgmt/user_session.c | 9 +++++++++ fs/smb/server/smb2pdu.c | 2 +- 4 files changed, 45 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 09e1e7771592..7889df8112b4 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -165,11 +165,43 @@ void ksmbd_all_conn_set_status(u64 sess_id, u32 status) up_read(&conn_list_lock); } -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id) +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) { wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); } +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id) +{ + struct ksmbd_conn *conn; + int rc, retry_count = 0, max_timeout = 120; + int rcount = 1; + +retry_idle: + if (retry_count >= max_timeout) + return -EIO; + + down_read(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + if (conn->binding || xa_load(&conn->sessions, sess_id)) { + if (conn == curr_conn) + rcount = 2; + if (atomic_read(&conn->req_running) >= rcount) { + rc = wait_event_timeout(conn->req_running_q, + atomic_read(&conn->req_running) < rcount, + HZ); + if (!rc) { + up_read(&conn_list_lock); + retry_count++; + goto retry_idle; + } + } + } + } + up_read(&conn_list_lock); + + return 0; +} + int ksmbd_conn_write(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 5c2845e47cf2..5b947175c048 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -145,7 +145,8 @@ extern struct list_head conn_list; extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); -void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 sess_id); struct ksmbd_conn *ksmbd_conn_alloc(void); void ksmbd_conn_free(struct ksmbd_conn *conn); bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 162a12685d2c..99416ce9f501 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -311,6 +311,7 @@ void destroy_previous_session(struct ksmbd_conn *conn, { struct ksmbd_session *prev_sess; struct ksmbd_user *prev_user; + int err; down_write(&sessions_table_lock); down_write(&conn->session_lock); @@ -325,8 +326,16 @@ void destroy_previous_session(struct ksmbd_conn *conn, memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) goto out; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); + err = ksmbd_conn_wait_idle_sess_id(conn, id); + if (err) { + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + goto out; + } + ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 3f4c56a10a86..cb7f487c96af 100644 --- a/fs/smb/server/smb2pdu.c +++ 
b/fs/smb/server/smb2pdu.c @@ -2213,7 +2213,7 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_conn_unlock(conn); ksmbd_close_session_fds(work); - ksmbd_conn_wait_idle(conn, sess_id); + ksmbd_conn_wait_idle(conn); /* * Re-lookup session to validate if session is deleted -- cgit From 7c525dddbee71880e654ad44f3917787a4f6042c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 16 Aug 2024 19:33:39 +0200 Subject: ksmbd: Replace one-element arrays with flexible-array members Replace the deprecated one-element arrays with flexible-array members in the structs filesystem_attribute_info and filesystem_device_info. There are no binary differences after this conversion. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Reviewed-by: Gustavo A. R. Silva Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 4 ++-- fs/smb/server/smb_common.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cb7f487c96af..0bc9edf22ba4 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5360,7 +5360,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, "NTFS", PATH_MAX, conn->local_nls, 0); len = len * 2; info->FileSystemNameLen = cpu_to_le32(len); - sz = sizeof(struct filesystem_attribute_info) - 2 + len; + sz = sizeof(struct filesystem_attribute_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } @@ -5386,7 +5386,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, len = len * 2; info->VolumeLabelSize = cpu_to_le32(len); info->Reserved = 0; - sz = sizeof(struct filesystem_vol_info) - 2 + len; + sz = sizeof(struct filesystem_vol_info) + len; rsp->OutputBufferLength = cpu_to_le32(sz); break; } diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index 4a3148b0167f..cc1d6dfe29d5 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -213,7 +213,7 @@ struct filesystem_attribute_info { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - __le16 FileSystemName[1]; /* do not have to save this - get subset? */ + __le16 FileSystemName[]; /* do not have to save this - get subset? 
*/ } __packed; struct filesystem_device_info { @@ -226,7 +226,7 @@ struct filesystem_vol_info { __le32 SerialNumber; __le32 VolumeLabelSize; __le16 Reserved; - __le16 VolumeLabel[1]; + __le16 VolumeLabel[]; } __packed; struct filesystem_info { -- cgit From d6d539c9a7ad0655e5ad46b5e869f1b20bce8953 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 19:51:13 -0400 Subject: bcachefs: Reallocate table when we're increasing size Fixes: c2f6e16a6771 ("bcachefs: Increase size of cuckoo hash table on too many rehashes") Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets_waiting_for_journal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index f70eb2127d32..f9fb150eda70 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -107,7 +107,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, nr_elements += t->d[i].journal_seq > flushed_seq; new_bits = ilog2(roundup_pow_of_two(nr_elements * 3)); - +realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; @@ -118,6 +118,8 @@ retry_rehash: if (nr_rehashes_this_size == 3) { new_bits++; nr_rehashes_this_size = 0; + kvfree(n); + goto realloc; } nr_rehashes++; -- cgit From d9f49c3106e404776afcf6c5682357f4fe088beb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 16:41:39 -0400 Subject: bcachefs: fix field-spanning write warning attempts to retrofit memory safety onto C are increasingly annoying ------------[ cut here ]------------ memcpy: detected field-spanning write (size 4) of single field "&k.replicas" at fs/bcachefs/replicas.c:454 (size 3) WARNING: CPU: 5 PID: 6525 at fs/bcachefs/replicas.c:454 bch2_replicas_gc2+0x2cb/0x400 [bcachefs] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) Modules linked in: dm_mod tun nf_conntrack_netlink nfnetlink xt_addrtype br_netfilter overlay msr sctp bcachefs lz4hc_compress lz4_compress libcrc32c xor raid6_pq lz4_decompress pps_ldisc pps_core wireguard libchacha20poly1305 chacha_x86_64 poly1305_x86_64 ip6_udp_tunnel udp_tunnel curve25519_x86_64 libcurve25519_generic libchacha sit tunnel4 ip_tunnel af_packet bridge stp llc ip6table_nat ip6table_filter ip6_tables xt_MASQUERADE xt_conntrack iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter ip_tables x_tables tcp_bbr sch_fq_codel efivarfs nls_iso8859_1 nls_cp437 vfat fat cdc_mbim cdc_wdm cdc_ncm cdc_ether usbnet r8152 input_leds joydev mii amdgpu mousedev hid_generic usbhid hid ath10k_pci amd_atl edac_mce_amd ath10k_core kvm_amd ath kvm mac80211 bfq crc32_pclmul crc32c_intel polyval_clmulni polyval_generic sha512_ssse3 sha256_ssse3 sha1_ssse3 snd_hda_codec_generic snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg i2c_algo_bit drm_exec snd_hda_codec r8169 drm_suballoc_helper aesni_intel gf128mul crypto_simd amdxcp realtek mfd_core tpm_crb drm_buddy snd_hwdep mdio_devres libarc4 cryptd tpm_tis wmi_bmof cfg80211 evdev libphy snd_hda_core tpm_tis_core gpu_sched rapl xhci_pci xhci_hcd snd_pcm drm_display_helper snd_timer tpm sp5100_tco rfkill efi_pstore mpt3sas drm_ttm_helper ahci usbcore libaescfb ccp snd ttm 8250 libahci watchdog soundcore raid_class sha1_generic acpi_cpufreq k10temp 8250_base usb_common scsi_transport_sas i2c_piix4 hwmon video 
serial_mctrl_gpio serial_base ecdh_generic wmi rtc_cmos backlight ecc gpio_amdpt rng_core gpio_generic button CPU: 5 UID: 0 PID: 6525 Comm: bcachefs Tainted: G W 6.11.0-rc1-ojab-00058-g224bc118aec9 #6 6d5debde398d2a84851f42ab300dae32c2992027 Tainted: [W]=WARN RIP: 0010:bch2_replicas_gc2+0x2cb/0x400 [bcachefs] Code: c7 c2 60 91 d1 c1 48 89 c6 48 c7 c7 98 91 d1 c1 4c 89 14 24 44 89 5c 24 08 48 89 44 24 20 c6 05 fa 68 04 00 01 e8 05 a3 40 e4 <0f> 0b 4c 8b 14 24 44 8b 5c 24 08 48 8b 44 24 20 e9 55 fe ff ff 8b RSP: 0018:ffffb434c9263d60 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9a8efa79cc00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffb434c9263de0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000005 R13: ffff9a8efa73c300 R14: ffff9a8d9e880000 R15: ffff9a8d9e8806f8 FS: 0000000000000000(0000) GS:ffff9a9410c80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000565423373090 CR3: 0000000164e30000 CR4: 00000000003506f0 Call Trace: ? __warn+0x97/0x150 ? bch2_replicas_gc2+0x2cb/0x400 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) ? report_bug+0x196/0x1c0 ? handle_bug+0x3c/0x70 ? exc_invalid_op+0x17/0x80 ? __wake_up_klogd.part.0+0x4c/0x80 ? asm_exc_invalid_op+0x16/0x20 ? bch2_replicas_gc2+0x2cb/0x400 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_replicas_gc2+0x2cb/0x400: bch2_replicas_gc2 at /home/ojab/src/bcachefs/fs/bcachefs/replicas.c:454 (discriminator 3) ? bch2_dev_usage_read+0xa0/0xa0 [bcachefs 9803eca5e131ef28f26250ede34072d5b50d98b3] bch2_dev_usage_read+0xa0/0xa0: discard_in_flight_remove at /home/ojab/src/bcachefs/fs/bcachefs/alloc_background.c:1712 Signed-off-by: Kent Overstreet --- fs/bcachefs/replicas.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 1223b710755d..12b1d28b7eb4 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -451,7 +451,8 @@ retry: .type = BCH_DISK_ACCOUNTING_replicas, }; - memcpy(&k.replicas, e, replicas_entry_bytes(e)); + unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), + "embedded variable length struct"); struct bpos p = disk_accounting_pos_to_bpos(&k); -- cgit From 47cdc7b14417a40af6a5d5909f1d28a5a23fc11d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 17:38:43 -0400 Subject: bcachefs: Fix incorrect gfp flags fixes: 00488 WARNING: CPU: 9 PID: 194 at mm/page_alloc.c:4410 __alloc_pages_noprof+0x1818/0x1888 00488 Modules linked in: 00488 CPU: 9 UID: 0 PID: 194 Comm: kworker/u66:1 Not tainted 6.11.0-rc1-ktest-g18fa10d6495f #2931 00488 Hardware name: linux,dummy-virt (DT) 00488 Workqueue: writeback wb_workfn (flush-bcachefs-2) 00488 pstate: 20001005 (nzCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00488 pc : __alloc_pages_noprof+0x1818/0x1888 00488 lr : __alloc_pages_noprof+0x5f4/0x1888 00488 sp : ffffff80ccd8ed00 00488 x29: ffffff80ccd8ed00 x28: 0000000000000000 x27: dfffffc000000000 00488 x26: 0000000000000010 x25: 0000000000000002 x24: 0000000000000000 00488 x23: 0000000000000000 x22: 1ffffff0199b1dbe x21: ffffff80cc680900 00488 x20: 0000000000000000 x19: ffffff80ccd8eed0 x18: 0000000000000000 00488 x17: ffffff80cc58a010 x16: dfffffc000000000 x15: 1ffffff00474e518 00488 x14: 1ffffff00474e518 x13: 1ffffff00474e518 x12: ffffffb8104701b9 00488 x11: 
1ffffff8104701b8 x10: ffffffb8104701b8 x9 : ffffffc08043cde8 00488 x8 : 00000047efb8fe48 x7 : ffffff80ccd8ee20 x6 : 0000000000048000 00488 x5 : 1ffffff810470138 x4 : 0000000000000050 x3 : 1ffffff0199b1d94 00488 x2 : ffffffb0199b1d94 x1 : 0000000000000001 x0 : ffffffc082387448 00488 Call trace: 00488 __alloc_pages_noprof+0x1818/0x1888 00488 new_slab+0x284/0x2f0 00488 ___slab_alloc+0x208/0x8e0 00488 __kmalloc_noprof+0x328/0x340 00488 __bch2_writepage+0x106c/0x1830 00488 write_cache_pages+0xa0/0xe8 due to __GFP_NOFAIL without allowing reclaim Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index cc33d763f722..184d03851676 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -534,7 +534,7 @@ do_io: if (f_sectors > w->tmp_sectors) { kfree(w->tmp); - w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), GFP_NOFS|__GFP_NOFAIL); w->tmp_sectors = f_sectors; } -- cgit From 5b5c96c63d5b6e91c622611e04b2b156bbae53f5 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Thu, 1 Aug 2024 19:26:22 +0800 Subject: erofs: simplify readdir operation - Use i_size instead of i_size_read() due to immutable fses; - Get rid of an unneeded goto since erofs_fill_dentries() also works; - Remove unnecessary lines. Signed-off-by: Hongzhen Luo Link: https://lore.kernel.org/r/20240801112622.2164029-1-hongzhen@linux.alibaba.com Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/dir.c | 35 ++++++++++++----------------------- fs/erofs/internal.h | 2 +- 2 files changed, 13 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index 2193a6710c8f..c3b90abdee37 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -8,19 +8,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, - unsigned int nameoff, unsigned int maxsize) + unsigned int nameoff0, unsigned int maxsize) { - const struct erofs_dirent *end = dentry_blk + nameoff; + const struct erofs_dirent *end = dentry_blk + nameoff0; while (de < end) { - const char *de_name; + unsigned char d_type = fs_ftype_to_dtype(de->file_type); + unsigned int nameoff = le16_to_cpu(de->nameoff); + const char *de_name = (char *)dentry_blk + nameoff; unsigned int de_namelen; - unsigned char d_type; - - d_type = fs_ftype_to_dtype(de->file_type); - - nameoff = le16_to_cpu(de->nameoff); - de_name = (char *)dentry_blk + nameoff; /* the last dirent in the block? 
*/ if (de + 1 >= end) @@ -52,21 +48,20 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = dir->i_sb; unsigned long bsz = sb->s_blocksize; - const size_t dirsize = i_size_read(dir); - unsigned int i = erofs_blknr(sb, ctx->pos); unsigned int ofs = erofs_blkoff(sb, ctx->pos); int err = 0; bool initial = true; buf.mapping = dir->i_mapping; - while (ctx->pos < dirsize) { + while (ctx->pos < dir->i_size) { + erofs_off_t dbstart = ctx->pos - ofs; struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, erofs_pos(sb, i), EROFS_KMAP); + de = erofs_bread(&buf, dbstart, EROFS_KMAP); if (IS_ERR(de)) { erofs_err(sb, "fail to readdir of logical block %u of nid %llu", - i, EROFS_I(dir)->nid); + erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; } @@ -79,25 +74,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) break; } - maxsize = min_t(unsigned int, dirsize - ctx->pos + ofs, bsz); - + maxsize = min_t(unsigned int, dir->i_size - dbstart, bsz); /* search dirents at the arbitrary position */ if (initial) { initial = false; - ofs = roundup(ofs, sizeof(struct erofs_dirent)); - ctx->pos = erofs_pos(sb, i) + ofs; - if (ofs >= nameoff) - goto skip_this; + ctx->pos = dbstart + ofs; } err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs, nameoff, maxsize); if (err) break; -skip_this: - ctx->pos = erofs_pos(sb, i) + maxsize; - ++i; + ctx->pos = dbstart + maxsize; ofs = 0; } erofs_put_metabuf(&buf); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 736607675396..45dc15ebd870 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -220,7 +220,7 @@ struct erofs_buf { }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((addr) >> (sb)->s_blocksize_bits) +#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) #define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) -- cgit From 2c534624ae70100aeea0b5800b0f3768b2fd3cf0 Mon Sep 17 00:00:00 2001 From: Hongzhen Luo Date: Tue, 6 Aug 2024 19:22:08 +0800 Subject: erofs: get rid of check_layout_compatibility() Simple enough to just open-code it. 
Signed-off-by: Hongzhen Luo Reviewed-by: Sandeep Dhavale Reviewed-by: Gao Xiang Link: https://lore.kernel.org/r/20240806112208.150323-1-hongzhen@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/super.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 32ce5b35e1df..6cb5c8916174 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -108,22 +108,6 @@ static void erofs_free_inode(struct inode *inode) kmem_cache_free(erofs_inode_cachep, vi); } -static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *dsb) -{ - const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - - EROFS_SB(sb)->feature_incompat = feature; - - /* check if current kernel meets all mandatory requirements */ - if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", - feature & ~EROFS_ALL_FEATURE_INCOMPAT); - return false; - } - return true; -} - /* read variable-sized metadata, offset will be aligned by 4-byte */ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, erofs_off_t *offset, int *lengthp) @@ -279,7 +263,7 @@ static int erofs_scan_devices(struct super_block *sb, static int erofs_read_superblock(struct super_block *sb) { - struct erofs_sb_info *sbi; + struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_super_block *dsb; void *data; @@ -291,9 +275,7 @@ static int erofs_read_superblock(struct super_block *sb) return PTR_ERR(data); } - sbi = EROFS_SB(sb); dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); - ret = -EINVAL; if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { erofs_err(sb, "cannot find valid erofs superblock"); @@ -318,8 +300,12 @@ static int erofs_read_superblock(struct super_block *sb) } ret = -EINVAL; - if (!check_layout_compatibility(sb, dsb)) + sbi->feature_incompat = le32_to_cpu(dsb->feature_incompat); + if (sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT) { + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", + sbi->feature_incompat & ~EROFS_ALL_FEATURE_INCOMPAT); goto out; + } sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; if (sbi->sb_size > PAGE_SIZE - EROFS_SUPER_OFFSET) { -- cgit From e080a26725fb36f535f22ea42694c60ab005fb2e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 19 Aug 2024 10:52:07 +0800 Subject: erofs: allow large folios for compressed files As commit 2e6506e1c4ee ("mm/migrate: fix deadlock in migrate_pages_batch() on large folios") has landed upstream, large folios can be safely enabled for compressed inodes since all prerequisites have already landed in 6.11-rc1. Stress tests have been running on my fleet for over 20 days without any regression. Additionally, users [1] have requested it for months. Let's allow large folios for EROFS full cases upstream now for wider testing. [1] https://lore.kernel.org/r/CAGsJ_4wtE8OcpinuqVwG4jtdx6Qh5f+TON6wz+4HMCq=A2qFcA@mail.gmail.com Cc: Barry Song <21cnbao@gmail.com> Cc: Matthew Wilcox (Oracle) [ Gao Xiang: minor commit typo fixes. 
] Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240819025207.3808649-1-hsiangkao@linux.alibaba.com --- Documentation/filesystems/erofs.rst | 2 +- fs/erofs/inode.c | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/erofs.rst b/Documentation/filesystems/erofs.rst index cc4626d6ee4f..c293f8e37468 100644 --- a/Documentation/filesystems/erofs.rst +++ b/Documentation/filesystems/erofs.rst @@ -75,7 +75,7 @@ Here are the main features of EROFS: - Support merging tail-end data into a special inode as fragments. - - Support large folios for uncompressed files. + - Support large folios to make use of THPs (Transparent Hugepages); - Support direct I/O on uncompressed files to avoid double caching for loop devices; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 43c09aae2afc..419432be3223 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -257,25 +257,23 @@ static int erofs_fill_inode(struct inode *inode) goto out_unlock; } + mapping_set_large_folios(inode->i_mapping); if (erofs_inode_is_data_compressed(vi->datalayout)) { #ifdef CONFIG_EROFS_FS_ZIP DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, erofs_info, inode->i_sb, "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!"); inode->i_mapping->a_ops = &z_erofs_aops; - err = 0; - goto out_unlock; -#endif +#else err = -EOPNOTSUPP; - goto out_unlock; - } - inode->i_mapping->a_ops = &erofs_raw_access_aops; - mapping_set_large_folios(inode->i_mapping); +#endif + } else { + inode->i_mapping->a_ops = &erofs_raw_access_aops; #ifdef CONFIG_EROFS_FS_ONDEMAND - if (erofs_is_fscache_mode(inode->i_sb)) - inode->i_mapping->a_ops = &erofs_fscache_access_aops; + if (erofs_is_fscache_mode(inode->i_sb)) + inode->i_mapping->a_ops = &erofs_fscache_access_aops; #endif - + } out_unlock: erofs_put_metabuf(&buf); return err; -- cgit From 2102bdac67b55bf2d1df4ff757bced74e94a5f74 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 17 Aug 2024 21:03:07 -0400 Subject: bcachefs: Extra debug for data move path We don't have sufficient information to debug: https://github.com/koverstreet/bcachefs/issues/726 - print out durability of extent ptrs, when non default - print the number of replicas we need in data_update_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 3 +++ fs/bcachefs/extents.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 6a854c918496..1ca628e93e87 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -475,6 +475,9 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, bch2_compression_opt_to_text(out, background_compression(*io_opts)); prt_newline(out); + prt_str(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); + prt_str(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4419ad3e454e..9406f82fc255 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1017,6 +1017,8 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc prt_printf(out, "ptr: %u:%llu:%u gen %u", ptr->dev, b, offset, ptr->gen); + if (ca->mi.durability != 1) + prt_printf(out, " d=%u", ca->mi.durability); if (ptr->cached) prt_str(out, " cached"); if (ptr->unwritten) -- cgit From 0005e01e1e875c5e27130c5e2ed0189749d1e08a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 20 Aug 
2024 16:56:19 +0800 Subject: erofs: fix out-of-bound access when z_erofs_gbuf_growsize() partially fails If z_erofs_gbuf_growsize() partially fails on a global buffer due to memory allocation failure or fault injection (as reported by syzbot [1]), new pages need to be freed by comparing to the existing pages to avoid memory leaks. However, the old gbuf->pages[] array may not be large enough, which can lead to null-ptr-deref or out-of-bound access. Fix this by checking against gbuf->nrpages in advance. [1] https://lore.kernel.org/r/000000000000f7b96e062018c6e3@google.com Reported-by: syzbot+242ee56aaa9585553766@syzkaller.appspotmail.com Fixes: d6db47e571dc ("erofs: do not use pagepool in z_erofs_gbuf_growsize()") Cc: # 6.10+ Reviewed-by: Chunhai Guo Reviewed-by: Sandeep Dhavale Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20240820085619.1375963-1-hsiangkao@linux.alibaba.com --- fs/erofs/zutil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/erofs/zutil.c b/fs/erofs/zutil.c index 9b53883e5caf..37afe2024840 100644 --- a/fs/erofs/zutil.c +++ b/fs/erofs/zutil.c @@ -111,7 +111,8 @@ int z_erofs_gbuf_growsize(unsigned int nrpages) out: if (i < z_erofs_gbuf_count && tmp_pages) { for (j = 0; j < nrpages; ++j) - if (tmp_pages[j] && tmp_pages[j] != gbuf->pages[j]) + if (tmp_pages[j] && (j >= gbuf->nrpages || + tmp_pages[j] != gbuf->pages[j])) __free_page(tmp_pages[j]); kfree(tmp_pages); } -- cgit From 92764e8822d4e7f8efb5ad959fac195a7f8ea0c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 14 Aug 2024 21:38:21 +0100 Subject: netfs, ceph: Partially revert "netfs: Replace PG_fscache by setting folio->private and marking dirty" This partially reverts commit 2ff1e97587f4d398686f52c07afde3faf3da4e5c. In addition to reverting the removal of PG_private_2 wrangling from the buffered read code[1][2], the removal of the waits for PG_private_2 from netfs_release_folio() and netfs_invalidate_folio() needs reverting too. It also adds a wait into ceph_evict_inode() to wait for netfs read and copy-to-cache ops to complete. 
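A reduced sketch of the ordering the ceph_evict_inode() hunk below establishes (stand-in types; the real wait is netfs_wait_for_outstanding_io()): no pagecache teardown may start while reads or copy-to-cache writes can still touch the pages:

#include <stdio.h>

struct demo_inode { int ops_in_flight; };

/* stand-in for netfs_wait_for_outstanding_io(): block until no read or
 * copy-to-cache operation can still write into the pagecache */
static void wait_for_outstanding_io(struct demo_inode *i)
{
	while (i->ops_in_flight)
		i->ops_in_flight--;	/* pretend each op completes */
}

static void truncate_pages(struct demo_inode *i)
{
	printf("truncating pagecache with %d ops in flight (must be 0)\n",
	       i->ops_in_flight);
}

int main(void)
{
	struct demo_inode i = { .ops_in_flight = 3 };

	wait_for_outstanding_io(&i);	/* the wait the patch adds ... */
	truncate_pages(&i);		/* ... must come before teardown */
	return 0;
}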
Fixes: 2ff1e97587f4 ("netfs: Replace PG_fscache by setting folio->private and marking dirty") Signed-off-by: David Howells Link: https://lore.kernel.org/r/3575457.1722355300@warthog.procyon.org.uk [1] Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8e5ced7804cb9184c4a23f8054551240562a8eda [2] Link: https://lore.kernel.org/r/20240814203850.2240469-2-dhowells@redhat.com cc: Max Kellermann cc: Ilya Dryomov cc: Xiubo Li cc: Jeff Layton cc: Matthew Wilcox cc: ceph-devel@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Christian Brauner --- fs/ceph/inode.c | 1 + fs/netfs/misc.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 71cd70514efa..4a8eec46254b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -695,6 +695,7 @@ void ceph_evict_inode(struct inode *inode) percpu_counter_dec(&mdsc->metric.total_inodes); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) ceph_fscache_unuse_cookie(inode, true); diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 83e644bd518f..554a1a4615ad 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -101,6 +101,8 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) _enter("{%lx},%zx,%zx", folio->index, offset, length); + folio_wait_private_2(folio); /* [DEPRECATED] */ + if (!folio_test_private(folio)) return; @@ -165,6 +167,11 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_private(folio)) return false; + if (unlikely(folio_test_private_2(folio))) { /* [DEPRECATED] */ + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; + folio_wait_private_2(folio); + } fscache_note_page_release(netfs_i_cookie(ctx)); return true; } -- cgit From 524b2c6dc80d735be9ebcd2decffe2889baab65d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 15 Aug 2024 14:39:33 +0200 Subject: romfs: fix romfs_read_folio() Add the correct offset to folio_zero_tail(). Fixes: d86f2de026c5 ("romfs: Convert romfs_read_folio() to use a folio") Reported-by: Greg Ungerer Link: https://lore.kernel.org/r/Zr0GTnPHfeA0P8nb@casper.infradead.org Signed-off-by: Christian Brauner --- fs/romfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 68758b6fed94..0addcc849ff2 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -126,7 +126,7 @@ static int romfs_read_folio(struct file *file, struct folio *folio) } } - buf = folio_zero_tail(folio, fillsize, buf); + buf = folio_zero_tail(folio, fillsize, buf + fillsize); kunmap_local(buf); folio_end_read(folio, ret == 0); return ret; -- cgit From 3f53d050416e88122d53aabbadb1fede998004da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 12:22:23 -0400 Subject: bcachefs: bch2_data_update_init() cleanup Factor out some helpers - this function has gotten much too big. 
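The new helpers share the usual take-everything-or-unwind shape; a minimal sketch of the pattern (take()/drop() are generic stand-ins, not bcachefs APIs):

    static bool get_all_refs(struct ref *refs, unsigned nr)
    {
            for (unsigned i = 0; i < nr; i++)
                    if (!take(&refs[i])) {
                            while (i--)     /* unwind the refs already taken */
                                    drop(&refs[i]);
                            return false;
                    }
            return true;
    }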
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 179 ++++++++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 1ca628e93e87..5f49f4953b19 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,76 @@ #include "subvolume.h" #include "trace.h" +static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) + bch2_dev_put(bch2_dev_have_ref(c, ptr->dev)); +} + +static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + if (!bch2_dev_tryget(c, ptr->dev)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); + } + return false; + } + } + return true; +} + +static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } +} + +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + + if (ctxt) { + bool locked; + + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + bucket = PTR_BUCKET_POS(ca, ptr2); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } + } + } + return true; +} + static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { if (trace_move_extent_finish_enabled()) { @@ -355,17 +425,11 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; - struct bkey_ptrs_c ptrs = - bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); - - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - if (c->opts.nocow_enabled) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(ca, ptr), 0); - bch2_dev_put(ca); - } + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); bch2_bio_free_pages_pool(c, &update->op.wbio.bio); @@ -546,7 +610,6 @@ int bch2_data_update_init(struct btree_trans *trans, const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; - unsigned ptrs_locked = 0; int ret = 0; /* @@ -557,6 +620,15 @@ int bch2_data_update_init(struct btree_trans *trans, if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; + if (!bkey_get_dev_refs(c, k)) + return -BCH_ERR_data_update_done; + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + bkey_put_dev_refs(c, k); + return 
-BCH_ERR_nocow_lock_blocked; + } + bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); m->btree_id = btree_id; @@ -578,40 +650,24 @@ int bch2_data_update_init(struct btree_trans *trans, m->op.compression_opt = background_compression(io_opts); m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; - bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - bch2_dev_put(bch2_dev_have_ref(c, ptr2->dev)); - } - return -BCH_ERR_data_update_done; - } - } - unsigned durability_have = 0, durability_removing = 0; i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - bool locked; - - rcu_read_lock(); - if (((1U << i) & m->data_opts.rewrite_ptrs)) { - BUG_ON(p.ptr.cached); - - if (crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; - - m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); - durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!p.ptr.cached && - !((1U << i) & m->data_opts.kill_ptrs)) { - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); - durability_have += bch2_extent_ptr_durability(c, &p); + if (!p.ptr.cached) { + rcu_read_lock(); + if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); + } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } + rcu_read_unlock(); } - rcu_read_unlock(); /* * op->csum_type is normally initialized from the fs/file's @@ -626,24 +682,6 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - if (c->opts.nocow_enabled) { - if (ctxt) { - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, - bucket, 0)) || - list_empty(&ctxt->ios)); - - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - ret = -BCH_ERR_nocow_lock_blocked; - goto err; - } - } - ptrs_locked |= (1U << i); - } - i++; } @@ -664,7 +702,7 @@ int bch2_data_update_init(struct btree_trans *trans, /* if iter == NULL, it's just a promote */ if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto done; + goto out; } m->op.nr_replicas = min(durability_removing, durability_required) + @@ -684,8 +722,7 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_data_update_to_text(&buf, m); WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_data_update_done; - goto done; + goto out; } m->op.nr_replicas_required = m->op.nr_replicas; @@ -696,30 +733,16 @@ int bch2_data_update_init(struct btree_trans *trans, ? 
0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto err; + goto out; } if (bkey_extent_is_unwritten(k)) { bch2_update_unwritten_extent(trans, m); - goto done; + goto out; } return 0; -err: - i = 0; - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bch_dev *ca = bch2_dev_have_ref(c, p.ptr.dev); - struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); - if ((1U << i) & ptrs_locked) - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - bch2_dev_put(ca); - i++; - } - - bch2_bkey_buf_exit(&m->k, c); - bch2_bio_free_pages_pool(c, &m->op.wbio.bio); - return ret; -done: +out: bch2_data_update_exit(m); return ret ?: -BCH_ERR_data_update_done; } -- cgit From 8cc0e50614520c6c609c6ae32a65d0591b7865a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 13:13:39 -0400 Subject: bcachefs: Fix "trying to move an extent, but nr_replicas=0" data_update_init() does a bunch of complicated stuff to decide how many replicas to add, since we only want to increase an extent's durability on an explicit rereplicate, but extent pointers may be on devices with different durability settings. There was a corner case when evacuating a device that had been set to durability=0 after data had been written to it, and extents on that device had already been rereplicated - then evacuate only needs to drop pointers on that device, not move them. So the assert for !m->op.nr_replicas was spurious; this was a perfectly legitimate case that needed to be handled. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5f49f4953b19..65176d51b502 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -695,16 +695,6 @@ int bch2_data_update_init(struct btree_trans *trans, * Increasing replication is an explicit operation triggered by * rereplicate, currently, so that users don't get an unexpected -ENOSPC */ - if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) && - !durability_required) { - m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; - m->data_opts.rewrite_ptrs = 0; - /* if iter == NULL, it's just a promote */ - if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); - goto out; - } - m->op.nr_replicas = min(durability_removing, durability_required) + m->data_opts.extra_replicas; @@ -716,17 +706,22 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); - if (!m->op.nr_replicas) { - struct printbuf buf = PRINTBUF; + m->op.nr_replicas_required = m->op.nr_replicas; - bch2_data_update_to_text(&buf, m); - WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); - printbuf_exit(&buf); + /* + * It might turn out that we don't need any new replicas, if the + * replicas or durability settings have been changed since the extent + * was written: + */ + if (!m->op.nr_replicas) { + m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs; + m->data_opts.rewrite_ptrs = 0; + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts); goto out; } - m->op.nr_replicas_required = m->op.nr_replicas; - if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas -- cgit From 548e7f51679bf0ec3cdc2027d780c5d06a2a7ac6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 13:24:26 -0400 
Subject: bcachefs: setting bcachefs_effective.* xattrs is a noop bcachefs_effective.* xattrs show the options inherited from parent directories (as well as those explicitly set); this namespace is not for setting bcachefs options. Change the .set() handler to a noop so that if e.g. rsync is copying xattrs it'll do the right thing, and only copy xattrs in the bcachefs.* namespace. We don't want to return an error, because that will cause rsync to bail out or get spammy. Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f2b4c17a0307..331f944d73dc 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -612,10 +612,20 @@ static int bch2_xattr_bcachefs_get_effective( name, buffer, size, true); } +/* Noop - xattrs in the bcachefs_effective namespace are inherited */ +static int bch2_xattr_bcachefs_set_effective(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + return 0; +} + static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { .prefix = "bcachefs_effective.", .get = bch2_xattr_bcachefs_get_effective, - .set = bch2_xattr_bcachefs_set, + .set = bch2_xattr_bcachefs_set_effective, }; #endif /* NO_BCACHEFS_FS */ -- cgit From 49203a6b9d12bfd1a223a67847a631a78f1cd782 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 15:08:12 -0400 Subject: bcachefs: Fix failure to relock in btree_node_get() discovered by new trans->locked asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index f5d85b50b6f2..cc778d7e769e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -974,6 +974,10 @@ retry: bch2_btree_node_wait_on_read(b); + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + /* * should_be_locked is not set on this path yet, so we need to * relock it specifically: -- cgit From e150a7e89c4727176d07f5a0a8966fc2af05821c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 20:18:34 -0400 Subject: bcachefs: Fix bch2_trigger_alloc assert Testing on an old, mangled filesystem turned up a case we had missed.
Fixes: bd864bc2d907 ("bcachefs: Fix bch2_trigger_alloc when upgrading from old versions") Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3a2522bc3e..488d0710f7b9 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -829,7 +829,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (likely(new.k->type == KEY_TYPE_alloc_v4)) { new_a = bkey_s_to_alloc_v4(new).v; } else { - BUG_ON(!(flags & BTREE_TRIGGER_gc)); + BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); ret = PTR_ERR_OR_ZERO(new_ka); -- cgit From c2a503f3e98e191d86738f5438a3a2b69575c830 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 18 Aug 2024 20:38:49 -0400 Subject: bcachefs: Fix bch2_bucket_gens_init() Comparing the wrong bpos - this was missed because normally bucket_gens_init() runs on brand new filesystems, but this bug caused it to overwrite bucket_gens keys with 0s when upgrading ancient filesystems. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 488d0710f7b9..ac933142aeda 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -556,7 +556,7 @@ int bch2_bucket_gens_init(struct bch_fs *c) struct bpos pos = alloc_gens_pos(iter.pos, &offset); int ret2 = 0; - if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret2) -- cgit From b8db1bd8020d5fecb3bf46cd8b954a657c20ba14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 16:13:16 -0400 Subject: bcachefs: fix time_stats_to_text() Fixes: 7423330e30ab ("bcachefs: prt_printf() now respects \r\n\t") Signed-off-by: Kent Overstreet --- fs/bcachefs/util.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 138320eaa2ad..1b8554460af4 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -416,7 +416,6 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats printbuf_tabstop_push(out, TABSTOP_SIZE + 2); prt_printf(out, "\tsince mount\r\trecent\r\n"); - prt_printf(out, "recent"); printbuf_tabstops_reset(out); printbuf_tabstop_push(out, out->indent + 20); -- cgit From cecc328240609df17395dfd0ea03cc813d8be36d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 11:25:39 -0400 Subject: bcachefs: fix missing bch2_err_str() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9138944c5ae6..267c2336b115 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2469,8 +2469,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ - if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "error looking up parent directory: %i", ret); + bch_err_msg(c, ret, "error looking up parent directory"); break; } -- 
cgit From 1dceae4cc12aa6389d9a8706f0d2a94d1679e79d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 12:10:33 -0400 Subject: bcachefs: unlock_long() before resort in journal replay Fix another SRCU splat - this one pretty harmless. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d89eb43c5ce9..11368dfa96b2 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -322,6 +322,7 @@ int bch2_journal_replay(struct bch_fs *c) } } + bch2_trans_unlock_long(trans); /* * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: -- cgit From 3c5d0b72a8e8c19c960e8fefb7463067e58b6bc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 15:22:55 -0400 Subject: bcachefs: fix failure to relock in bch2_btree_node_mem_alloc() We weren't always so strict about trans->locked state - but now we are, and new assertions are shaking some bugs out. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 17 ++++++++++++++ fs/bcachefs/btree_cache.h | 2 ++ fs/bcachefs/btree_update_interior.c | 46 ++++++++++++++++++++----------------- 3 files changed, 44 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index cc778d7e769e..063725ecb2b3 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -159,6 +159,16 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return b; } +void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) +{ + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + /* Btree in memory cache - hash table */ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) @@ -736,6 +746,13 @@ out: start_time); memalloc_nofs_restore(flags); + + int ret = bch2_trans_relock(trans); + if (unlikely(ret)) { + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); + } + return b; err: mutex_lock(&bc->lock); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index c0eb87a057cc..f82064007127 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -12,6 +12,8 @@ struct btree_iter; void bch2_recalc_btree_reserve(struct bch_fs *); +void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *); + void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index b3454d4619e8..8fd112026e7a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -317,6 +317,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, : 0; int ret; + b = bch2_btree_node_mem_alloc(trans, interior_node); + if (IS_ERR(b)) + return b; + + BUG_ON(b->ob.nr); + mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -325,10 +331,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, obs = a->ob; bkey_copy(&tmp.k, &a->k); mutex_unlock(&c->btree_reserve_cache_lock); - goto mem_alloc; + goto out; } 
mutex_unlock(&c->btree_reserve_cache_lock); - retry: ret = bch2_alloc_sectors_start_trans(trans, c->opts.metadata_target ?: @@ -341,7 +346,7 @@ retry: c->opts.metadata_replicas_required), watermark, 0, cl, &wp); if (unlikely(ret)) - return ERR_PTR(ret); + goto err; if (wp->sectors_free < btree_sectors(c)) { struct open_bucket *ob; @@ -360,19 +365,16 @@ retry: bch2_open_bucket_get(c, wp, &obs); bch2_alloc_sectors_done(c, wp); -mem_alloc: - b = bch2_btree_node_mem_alloc(trans, interior_node); - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - - /* we hold cannibalize_lock: */ - BUG_ON(IS_ERR(b)); - BUG_ON(b->ob.nr); - +out: bkey_copy(&b->key, &tmp.k); b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); return b; +err: + bch2_btree_node_to_freelist(c, b); + return ERR_PTR(ret); } static struct btree *bch2_btree_node_alloc(struct btree_update *as, @@ -2439,6 +2441,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite } new_hash = bch2_btree_node_mem_alloc(trans, false); + ret = PTR_ERR_OR_ZERO(new_hash); + if (ret) + goto err; } path->intent_ref++; @@ -2446,14 +2451,9 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite commit_flags, skip_triggers); --path->intent_ref; - if (new_hash) { - mutex_lock(&c->btree_cache.lock); - list_move(&new_hash->list, &c->btree_cache.freeable); - mutex_unlock(&c->btree_cache.lock); - - six_unlock_write(&new_hash->c.lock); - six_unlock_intent(&new_hash->c.lock); - } + if (new_hash) + bch2_btree_node_to_freelist(c, new_hash); +err: closure_sync(&cl); bch2_btree_cache_cannibalize_unlock(trans); return ret; @@ -2522,6 +2522,10 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(trans); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret; + set_btree_node_fake(b); set_btree_node_need_rewrite(b); b->c.level = level; @@ -2553,7 +2557,7 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) { - bch2_trans_run(c, bch2_btree_root_alloc_fake_trans(trans, id, level)); + bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); } static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) -- cgit From 5dbfc4ef72f15508882aff58c307b8425cf037a8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 15:04:15 -0400 Subject: bcachefs: fix failure to relock in btree_node_fill() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 063725ecb2b3..e52a06d3418c 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -873,6 +873,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, bch2_btree_node_read(trans, b, sync); + int ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + if (!sync) return NULL; -- cgit From 6575b8c9877c3dd1f7db1d0d61bd250a0bf18b6d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 20 Aug 2024 19:31:20 -0400 Subject: bcachefs: Fix locking in bch2_ioc_setlabel() Fixes: 7a254053a590 ("bcachefs: support FS_IOC_SETFSLABEL") Reported-by: syzbot+7e9efdfec27fbde0141d@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index aea8132d2c40..99c7fe987c74 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -328,9 +328,8 @@ static int bch2_ioc_setlabel(struct bch_fs *c, mutex_lock(&c->sb_lock); strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); - mutex_unlock(&c->sb_lock); - ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); mnt_drop_write_file(file); return ret; -- cgit From cab18be6957b6af8cbe3502fd5f6d7b9f02ccceb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 20:49:07 -0400 Subject: bcachefs: Fix replay_now_at() assert Journal replay, in the slowpath where we insert keys in journal order, was inserting keys in the wrong order; keys from early repair come last. Reported-by: syzbot+2c4fcb257ce2b6a29d0e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 11368dfa96b2..36de1c6fe8c3 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -241,7 +241,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) const struct journal_key *l = *((const struct journal_key **)_l); const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(l->journal_seq, r->journal_seq); + /* + * Map 0 to U64_MAX, so that keys with journal_seq === 0 come last + * + * journal_seq == 0 means that the key comes from early repair, and + * should be inserted last so as to avoid overflowing the journal + */ + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); } int bch2_journal_replay(struct bch_fs *c) -- cgit From bdbdd4759f081ca2d0a5d9e8af21d742ffaf8439 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 21:10:45 -0400 Subject: bcachefs: Fix missing validation in bch2_sb_journal_v2_validate() Reported-by: syzbot+47ecc948aadfb2ab3efc@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_sb.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index db80e506e3ab..62b910f2fb27 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -104,6 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); int ret = -BCH_ERR_invalid_sb_journal; + u64 sum = 0; unsigned nr; unsigned i; struct u64_range *b; @@ -119,6 +120,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + + if (b[i].end <= b[i].start) { + prt_printf(err, "journal buckets entry with bad nr: %llu+%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].nr)); + goto err; + } + + sum += le64_to_cpu(journal->d[i].nr); } sort(b, nr, sizeof(*b), u64_range_cmp, NULL); @@ -148,6 +158,11 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, struct bch_sb_field *f } } + if (sum > UINT_MAX) { + prt_printf(err, "too many journal buckets: %llu > %u", sum, UINT_MAX); + goto err; + } + ret = 0; err: kfree(b); -- cgit From 06f67437ab356e3140f51aea272d33ce28421f66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 
2024 22:06:44 -0400 Subject: fs/super.c: improve get_tree() error message We're seeing an odd bug where we fail to correctly return an error from .get_tree(): https://syzkaller.appspot.com/bug?extid=c0360e8367d6d8d04a66. We need to be able to distinguish between accidentally returning a positive error (as implied by the log) and no error. Cc: David Howells Signed-off-by: Kent Overstreet --- fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 38d72a3cf6fc..b7913b55debc 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1802,8 +1802,8 @@ int vfs_get_tree(struct fs_context *fc) return error; if (!fc->root) { - pr_err("Filesystem %s get_tree() didn't set fc->root\n", - fc->fs_type->name); + pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n", + fc->fs_type->name, error); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ -- cgit From 7f2de6947f92cfa4be8e5eaa1237e962bb8ee65f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 22:27:45 -0400 Subject: bcachefs: Fix warning in bch2_fs_journal_stop() j->last_empty_seq needs to match j->seq when the journal is empty. Reported-by: syzbot+4093905737cf289b6b38@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 649e3a01608a..f5f7db50ca31 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1260,7 +1260,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) } if (!had_entries) - j->last_empty_seq = cur_seq; + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); -- cgit From 8ed823b19214e403ca485532f48c0e02035021ae Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 22:57:56 -0400 Subject: bcachefs: Fix compat issue with old alloc_v4 keys We allow new fields to be added to existing key types, and new versions should treat them as being zeroed; this was not handled in alloc_v4_validate.
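The forward-compat rule being implemented: copy however many bytes the on-disk value actually carries, then zero-fill the rest of the in-memory struct, so fields added in newer versions read as zero on old keys. A sketch of the idea (the real implementation is the bkey_val_copy() macro in the diff below):

    struct bch_alloc_v4 a = {};
    unsigned b = min_t(unsigned, sizeof(a), bkey_val_bytes(k.k));

    memcpy(&a, k.v, b);     /* everything past b stays zeroed */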
Reported-by: syzbot+3b2968fa4953885dd66a@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 50 ++++++++++++++++++----------------- fs/bcachefs/alloc_background_format.h | 1 + fs/bcachefs/btree_iter.h | 9 +++++++ 3 files changed, 36 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ac933142aeda..dc97b1f8bc08 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -240,71 +240,73 @@ fsck_err: int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + struct bch_alloc_v4 a; int ret = 0; - bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), + bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); + + bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), c, alloc_v4_val_size_bad, "bad val size (%u > %zu)", - alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k)); + alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); - bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && - BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), + bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && + BCH_ALLOC_V4_NR_BACKPOINTERS(&a), c, alloc_v4_backpointers_start_bad, "invalid backpointers_start"); - bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, + bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, c, alloc_key_data_type_bad, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + a.data_type, alloc_data_type(a, a.data_type)); for (unsigned i = 0; i < 2; i++) - bkey_fsck_err_on(a.v->io_time[i] > LRU_TIME_MAX, + bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, c, alloc_key_io_time_bad, "invalid io_time[%s]: %llu, max %llu", i == READ ? "read" : "write", - a.v->io_time[i], LRU_TIME_MAX); + a.io_time[i], LRU_TIME_MAX); - unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(a.v) * sizeof(u64) > + unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > offsetof(struct bch_alloc_v4, stripe_sectors) - ? a.v->stripe_sectors + ? 
a.stripe_sectors : 0; - switch (a.v->data_type) { + switch (a.data_type) { case BCH_DATA_free: case BCH_DATA_need_gc_gens: case BCH_DATA_need_discard: bkey_fsck_err_on(stripe_sectors || - a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe, + a.dirty_sectors || + a.cached_sectors || + a.stripe, c, alloc_key_empty_but_have_data, "empty data type free but have data %u.%u.%u %u", stripe_sectors, - a.v->dirty_sectors, - a.v->cached_sectors, - a.v->stripe); + a.dirty_sectors, + a.cached_sectors, + a.stripe); break; case BCH_DATA_sb: case BCH_DATA_journal: case BCH_DATA_btree: case BCH_DATA_user: case BCH_DATA_parity: - bkey_fsck_err_on(!a.v->dirty_sectors && + bkey_fsck_err_on(!a.dirty_sectors && !stripe_sectors, c, alloc_key_dirty_sectors_0, "data_type %s but dirty_sectors==0", - bch2_data_type_str(a.v->data_type)); + bch2_data_type_str(a.data_type)); break; case BCH_DATA_cached: - bkey_fsck_err_on(!a.v->cached_sectors || - a.v->dirty_sectors || + bkey_fsck_err_on(!a.cached_sectors || + a.dirty_sectors || stripe_sectors || - a.v->stripe, + a.stripe, c, alloc_key_cached_inconsistency, "data type inconsistency"); - bkey_fsck_err_on(!a.v->io_time[READ] && + bkey_fsck_err_on(!a.io_time[READ] && c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index 47d9d006502c..f754a2951d8a 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -69,6 +69,7 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + /* end of fields in original version of alloc_v4 */ __u64 fragmentation_lru; __u32 stripe_sectors; __u32 pad; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dca62375d7d3..222b7ce8a901 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -569,6 +569,15 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +#define bkey_val_copy(_dst_v, _src_k) \ +do { \ + unsigned b = min_t(unsigned, sizeof(*_dst_v), \ + bkey_val_bytes(_src_k.k)); \ + memcpy(_dst_v, _src_k.v, b); \ + if (b < sizeof(*_dst_v)) \ + memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ +} while (0) + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned btree_id, struct bpos pos, unsigned flags, unsigned type, -- cgit From 0b50b7313ef2494926df30ce8e2ce284f1b847fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Aug 2024 23:21:52 -0400 Subject: bcachefs: Fix refcounting in discard path bch_dev->io_ref does not protect against the filesystem going away; bch_fs->writes does. Thus the filesystem write ref needs to be the last ref we release. 
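Put differently, refs must be dropped in reverse order of acquisition: take the fs-wide write ref first and release it last. The resulting shape, condensed from the diff below:

    if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
            return;                         /* fs may be shutting down */
    if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
            goto put_write_ref;
    /* queue the discard work here */
    percpu_ref_put(&ca->io_ref);            /* narrower ref goes first */
put_write_ref:
    bch2_write_ref_put(c, BCH_WRITE_REF_discard);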
Reported-by: syzbot+9e0404b505e604f67e41@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index dc97b1f8bc08..ba46f1c1d78a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1874,26 +1874,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) - goto put_ioref; + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - bch2_write_ref_put(c, BCH_WRITE_REF_discard); -put_ioref: percpu_ref_put(&ca->io_ref); +put_write_ref: + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) -- cgit From dedb2fe37574857c84e9598b9f5272505dedf7af Mon Sep 17 00:00:00 2001 From: Yuesong Li Date: Thu, 22 Aug 2024 14:21:58 +0800 Subject: bcachefs: Fix double assignment in check_dirent_to_subvol() ret was assigned twice in check_dirent_to_subvol(). Reported by cocci. Signed-off-by: Yuesong Li Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 267c2336b115..6801c37ee803 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2006,7 +2006,6 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); ret = -BCH_ERR_fsck_repair_unimplemented; - ret = 0; goto err; } -- cgit From 87313ac1f134d6ee1e7c858da8bdea9147b537a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 02:13:02 -0400 Subject: bcachefs: clear path->should_be_locked in bch2_btree_key_cache_drop() bch2_btree_key_cache_drop() evicts the key cache entry - it's used when we're doing an update that bypasses the key cache, because for cache coherency reasons a key can't be in the key cache unless it also exists in the btree - i.e. creates have to bypass the cache. After evicting, the path no longer points to a key cache key, and relock() will always fail if should_be_locked is true. 
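Spelled out, the fix is just clearing the flag at eviction time, next to the existing invalidation (condensed from the one-liner below):

    mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
    btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
    path->should_be_locked = false; /* the cached key is gone; relock() must not insist on it */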
Prep for improving path->should_be_locked assertions Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 79954490627c..9b3ec2a3b8ce 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -726,6 +726,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + path->should_be_locked = false; } static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, -- cgit From 3e878fe5a0b139838a65f50a3df3caf3299dbc24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 03:57:39 -0400 Subject: bcachefs: add missing inode_walker_exit() fix a small leak Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6801c37ee803..83bd31b44aad 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2215,6 +2215,8 @@ int bch2_check_xattrs(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_xattr(trans, &iter, k, &hash_info, &inode))); + + inode_walker_exit(&inode); bch_err_fn(c, ret); return ret; } -- cgit From a592cdf5164d3feb821085df71f63e70e8b8b08c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 19 Aug 2024 16:41:00 -0400 Subject: bcachefs: don't use rht_bucket() in btree_key_cache_scan() rht_bucket() does strange complicated things when a rehash is in progress. Instead, just skip scanning when a rehash is in progress: scanning is going to be more expensive (many more empty slots to cover), and some sort of infinite loop is being observed Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9b3ec2a3b8ce..fda7998734cb 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -778,6 +778,20 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + + /* + * Scanning is expensive while a rehash is in progress - most elements + * will be on the new hashtable, if it's in progress + * + * A rehash could still start while we're scanning - that's ok, we'll + * still see most elements. 
+ */ + if (unlikely(tbl->nest)) { + rcu_read_unlock(); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + return SHRINK_STOP; + } + if (bc->shrink_iter >= tbl->size) bc->shrink_iter = 0; start = bc->shrink_iter; @@ -785,7 +799,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, do { struct rhash_head *pos, *next; - pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]); while (!rht_is_a_nulls(pos)) { next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); @@ -866,12 +880,22 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) while (atomic_long_read(&bc->nr_keys)) { rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - if (tbl) + if (tbl) { + if (tbl->nest) { + /* wait for in progress rehash */ + rcu_read_unlock(); + mutex_lock(&bc->table.mutex); + mutex_unlock(&bc->table.mutex); + rcu_read_lock(); + continue; + } for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) { + ck = container_of(pos, struct bkey_cached, hash); bkey_cached_evict(bc, ck); list_add(&ck->list, &items); } + } rcu_read_unlock(); } -- cgit From ce61b605a00502c59311d0a4b1f58d62b48272d0 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 20 Aug 2024 22:07:38 +0900 Subject: ksmbd: the buffer of smb2 query dir response has at least 1 byte When the STATUS_NO_MORE_FILES status is set in an smb2 query dir response, ->StructureSize is set to 9, which means the buffer has 1 byte. This issue occurs because ->Buffer[1] in smb2_query_directory_rsp was converted to a flex-array. Fixes: eb3e28c1e89b ("smb3: Replace smb2pdu 1-element arrays with flex-arrays") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0bc9edf22ba4..e9204180919e 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4409,7 +4409,8 @@ int smb2_query_dir(struct ksmbd_work *work) rsp->OutputBufferLength = cpu_to_le32(0); rsp->Buffer[0] = 0; rc = ksmbd_iov_pin_rsp(work, (void *)rsp, - sizeof(struct smb2_query_directory_rsp)); + offsetof(struct smb2_query_directory_rsp, Buffer) + + 1); if (rc) goto err_out; } else { -- cgit From 2186a116538a715b20e15f84fdd3545e5fe0a39b Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:50 +0000 Subject: smb/server: fix return value of smb2_open() In most error cases, the error code is not returned from smb2_open(), so __process_request() will not print an error message. Fix this by returning the correct value at the end of smb2_open().
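The shape of the fix is making the shared exit label propagate the saved status instead of hardcoding success (condensed from the diff below):

    err_out2:
            kfree(name);
            kfree(lc);
            return rc;      /* was "return 0"; callers never saw the error */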
Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index e9204180919e..55d4e69bd9c9 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3713,7 +3713,7 @@ err_out2: kfree(name); kfree(lc); - return 0; + return rc; } static int readdir_info_level_struct_sz(int info_level) -- cgit From 4e8771a3666c8f216eefd6bd2fd50121c6c437db Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:51 +0000 Subject: smb/server: fix potential null-ptr-deref of lease_ctx_info in smb2_open() A null-ptr-deref will occur when (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) and parse_lease_state() returns NULL. Fix this by checking whether 'lease_ctx_info' is NULL. Additionally, remove the redundant parentheses in parse_durable_handle_context(). Signed-off-by: ChenXiaoSong Signed-off-by: Steve French --- fs/smb/server/oplock.c | 2 +- fs/smb/server/smb2pdu.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index a8f52c4ebbda..e546ffa57b55 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1510,7 +1510,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease) * parse_lease_state() - parse lease context containted in file open request * @open_req: buffer containing smb2 file open(create) request * - * Return: oplock state, -ENOENT if create lease context not found + * Return: allocated lease context object on success, otherwise NULL */ struct lease_ctx_info *parse_lease_state(void *open_req) { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 55d4e69bd9c9..5d170ab0817d 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2770,8 +2770,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, } } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { dh_info->CreateGuid = durable_v2_blob->CreateGuid; dh_info->persistent = @@ -2791,8 +2791,8 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } - if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || - req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) { + if ((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) || + req_op_level == SMB2_OPLOCK_LEVEL_BATCH) { ksmbd_debug(SMB, "Request for durable open\n"); dh_info->type = dh_idx; } @@ -3414,7 +3414,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && lc) { if (S_ISDIR(file_inode(filp)->i_mode)) { lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE; lc->is_dir = true; -- cgit From 0dd771b7d60b8281f10f6721783c60716d22075f Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:52 +0000 Subject: smb/server: remove useless assignment of 'file_present' in smb2_open() The variable is already true here.
Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 5d170ab0817d..cd23517d9640 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -3096,7 +3096,6 @@ int smb2_open(struct ksmbd_work *work) goto err_out; } - file_present = true; idmap = mnt_idmap(path.mnt); } else { if (rc != -ENOENT) -- cgit From 2b7e0573a49064d9c94c114b4471327cd96ae39c Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 22 Aug 2024 08:20:54 +0000 Subject: smb/server: update misguided comment of smb2_allocate_rsp_buf() smb2_allocate_rsp_buf() can return error codes other than -ENOMEM. Signed-off-by: ChenXiaoSong Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index cd23517d9640..20846a4d3031 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -519,7 +519,7 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work) * smb2_allocate_rsp_buf() - allocate smb2 response buffer * @work: smb work containing smb request buffer * - * Return: 0 on success, otherwise -ENOMEM + * Return: 0 on success, otherwise error */ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { -- cgit From 15179cf2806f91685410e598f82813a7fcf90f6c Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 16 Aug 2024 16:47:39 -0500 Subject: smb3: fix problem unloading module due to leaked refcount on shutdown The shutdown ioctl can leak a refcount on the tlink, which can prevent rmmod (unloading the cifs.ko module) from working. Found while debugging xfstest generic/043 Fixes: 69ca1f57555f ("smb3: add dynamic tracepoints for shutdown ioctl") Reviewed-by: Meetakshi Setiya Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 3 +++ fs/smb/client/ioctl.c | 2 ++ fs/smb/client/link.c | 1 + 3 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index d2307162a2de..c1c14274930a 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4194,6 +4194,9 @@ tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) * * If one doesn't exist then insert a new tcon_link struct into the tree and * try to construct a new one.
+ * + * REMEMBER to call cifs_put_tlink() after successful calls to cifs_sb_tlink, + * to avoid refcount issues */ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 44dbaf9929a4..9bb5c869f4db 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -229,9 +229,11 @@ static int cifs_shutdown(struct super_block *sb, unsigned long arg) shutdown_good: trace_smb3_shutdown_done(flags, tcon->tid); + cifs_put_tlink(tlink); return 0; shutdown_out_err: trace_smb3_shutdown_err(rc, flags, tcon->tid); + cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index d86da949a919..80099bbb333b 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -588,6 +588,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); + /* BB could be clearer if skipped put_tlink on error here, but harmless */ goto symlink_exit; } pTcon = tlink_tcon(tlink); -- cgit From ec686804117a0421cf31d54427768aaf93aa0069 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 21 Aug 2024 00:45:03 -0300 Subject: smb: client: ignore unhandled reparse tags Just ignore reparse points that the client can't parse rather than bailing out and not opening the file or directory. Reported-by: Marc <1marc1@gmail.com> Closes: https://lore.kernel.org/r/CAMHwNVv-B+Q6wa0FEXrAuzdchzcJRsPKDDRrNaYZJd6X-+iJzw@mail.gmail.com Fixes: 539aad7f14da ("smb: client: introduce ->parse_reparse_point()") Tested-by: Anthony Nandaa (Microsoft) Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French --- fs/smb/client/reparse.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 689d8a506d45..48c27581ec51 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -378,6 +378,8 @@ int parse_reparse_point(struct reparse_data_buffer *buf, u32 plen, struct cifs_sb_info *cifs_sb, bool unicode, struct cifs_open_info_data *data) { + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -394,12 +396,13 @@ int parse_reparse_point(struct reparse_data_buffer *buf, case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_LX_CHR: case IO_REPARSE_TAG_LX_BLK: - return 0; + break; default: - cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n", - __func__, le32_to_cpu(buf->ReparseTag)); - return -EOPNOTSUPP; + cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", + le32_to_cpu(buf->ReparseTag)); + break; } + return 0; } int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, -- cgit From f58bab6fd4063913bd8321e99874b8239e9ba726 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Aug 2024 14:47:01 -0400 Subject: nfsd: ensure that nfsd4_fattr_args.context is zeroed out If nfsd4_encode_fattr4 ends up doing a "goto out" before we get to checking for the security label, then args.context will be set to uninitialized junk on the stack, which we'll then try to free. Initialize it early. 
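The underlying discipline: anything released on a shared exit path must be initialized before the first goto can reach that path. A generic sketch (names hypothetical, not the nfsd code):

    struct foo *ctx = NULL; /* init before any 'goto out' is possible */

    if (setup_failed)
            goto out;       /* ctx is a known NULL, not stack junk */
    ctx = alloc_ctx();
    /* ... */
out:
    kfree(ctx);             /* kfree(NULL) is a no-op, so this is always safe */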
Fixes: f59388a579c6 ("NFSD: Add nfsd4_encode_fattr4_sec_label()") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 42b41d55d4ed..43ccf6119cf1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3545,6 +3545,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.dentry = dentry; args.ignore_crossmnt = (ignore_crossmnt != 0); args.acl = NULL; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + args.context = NULL; +#endif /* * Make a local copy of the attribute bitmap that can be modified. @@ -3617,7 +3620,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, args.contextsupport = false; #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - args.context = NULL; if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) -- cgit From 5e51224d2afbda57f33f47485871ee5532145e18 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Tue, 20 Aug 2024 14:33:15 +0000 Subject: smb/client: fix typo: GlobalMid_Sem -> GlobalMid_Lock The comments have typos, fix that to not confuse readers. Signed-off-by: ChenXiaoSong Reviewed-by: Namjae Jeon --- fs/smb/client/cifsfs.c | 6 +++--- fs/smb/client/cifsglob.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 2c4b357d85e2..d89485235425 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -75,9 +75,9 @@ unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information */ -unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 5c9b3e6cd95f..7ebe80a25d04 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -2017,9 +2017,9 @@ extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information */ -extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ -extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ +extern unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */ +extern unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */ +extern unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */ extern spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */ /* -- cgit From cb78f9b7d0c0c9f86d8c0ac9c46b8b684d8785a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 15 Aug 2024 10:18:41 -0400 Subject: nfs: fix the fetch of FATTR4_OPEN_ARGUMENTS The client doesn't properly request FATTR4_OPEN_ARGUMENTS in the initial SERVER_CAPS getattr. Add FATTR4_WORD2_OPEN_ARGUMENTS to the initial request. 
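For background, NFSv4 attribute bitmaps are arrays of 32-bit words: attribute number n lives at bit n % 32 of word n / 32, so an attribute defined in word 2 is never requested unless bitmask[2] carries its bit. A sketch of the mapping (illustrative helper, not a kernel API):

    static inline void fattr4_request(u32 *bitmask, unsigned int attr)
    {
            bitmask[attr / 32] |= 1U << (attr % 32);
    }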
Fixes: 707f13b3d081 (NFSv4: Add support for the FATTR4_OPEN_ARGUMENTS attribute) Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8883016c551c..06df74362e94 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3931,7 +3931,8 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | + FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { -- cgit From 95832998fb6edc50d4f2f6a958d9f90142d4be48 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 21 Aug 2024 08:28:25 -0400 Subject: nfs: fix bitmap decoder to handle a 3rd word It only decodes the first two words at this point. Have it decode the third word as well. Without this, the client doesn't send delegated timestamps in the CB_GETATTR response. With this change we also need to expand the on-stack bitmap in decode_recallany_args to 3 elements, in case the server sends a larger bitmap than expected. Fixes: 43df7110f4a9 ("NFSv4: Add CB_GETATTR support for delegated attributes") Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 29c49a7e5fe1..6df77f008d3f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -118,7 +118,9 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) if (likely(attrlen > 0)) bitmap[0] = ntohl(*p++); if (attrlen > 1) - bitmap[1] = ntohl(*p); + bitmap[1] = ntohl(*p++); + if (attrlen > 2) + bitmap[2] = ntohl(*p); return 0; } @@ -446,7 +448,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, void *argp) { struct cb_recallanyargs *args = argp; - uint32_t bitmap[2]; + uint32_t bitmap[3]; __be32 *p, status; p = xdr_inline_decode(xdr, 4); -- cgit From a017ad1313fc91bdf235097fd0a02f673fc7bb11 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:00 -0400 Subject: NFSv4: Add missing rescheduling points in nfs_client_return_marked_delegations We're seeing reports of soft lockups when iterating through the loops, so let's add rescheduling points. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index cbbd4866b0b7..97b386032b71 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -228,6 +229,7 @@ static int __nfs_list_for_each_server(struct list_head *head, ret = fn(server, data); if (ret) goto out; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); -- cgit From d72b7963115bea971a28eaa2cb76722c023f9fdf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:01 -0400 Subject: NFSv4: Fix clearing of layout segments in layoutreturn Make sure that we clear the layout segments in cases where we see a fatal error, and also in the case where the layout is invalid. 
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++++--- fs/nfs/pnfs.c | 5 ++--- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 06df74362e94..b8ffbe52ba15 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9998,6 +9998,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) fallthrough; default: task->tk_status = 0; + lrp->res.lrs_present = 0; fallthrough; case 0: break; @@ -10011,9 +10012,11 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) - break; - goto out_restart; + if (nfs4_async_handle_error(task, server, NULL, NULL) == + -EAGAIN) + goto out_restart; + lrp->res.lrs_present = 0; + break; } return; out_restart: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index aa698481bec8..0d16b383a452 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1284,10 +1284,9 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, LIST_HEAD(freeme); spin_lock(&inode->i_lock); - if (!pnfs_layout_is_valid(lo) || - !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + if (!nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) goto out_unlock; - if (stateid) { + if (stateid && pnfs_layout_is_valid(lo)) { u32 seq = be32_to_cpu(arg_stateid->seqid); pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); -- cgit From f92214e4c312f6ea9d78650cc6291d200f17abb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 21 Aug 2024 14:05:02 -0400 Subject: NFS: Avoid unnecessary rescanning of the per-server delegation list If the call to nfs_delegation_grab_inode() fails, we will not have dropped any locks that require us to rescan the list. 
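The locking rationale can be sketched as a plain list walk (standalone C with illustrative types; the real code holds rcu_read_lock() across the loop): when grabbing an entry fails without dropping any lock, the walk can simply move on to the next entry instead of restarting from the head.

    #include <stdio.h>

    struct entry {
        int grabbable;
        struct entry *next;
    };

    /* May fail, but drops no locks when it does. */
    static struct entry *try_grab(struct entry *e)
    {
        return e->grabbable ? e : NULL;
    }

    static int visit_all(struct entry *head)
    {
        int visited = 0;

        for (struct entry *e = head; e; e = e->next) {
            if (!try_grab(e))
                continue;   /* nothing was unlocked, so no rescan is needed */
            visited++;
        }
        return visited;
    }

    int main(void)
    {
        struct entry c = { 1, NULL }, b = { 0, &c }, a = { 1, &b };

        printf("visited %d\n", visit_all(&a));  /* prints "visited 2" */
        return 0;
    }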
Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- fs/nfs/delegation.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d5edb3b3eeef..20cb2008f9e4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -647,6 +647,9 @@ restart: prev = delegation; continue; } + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) + continue; if (prev) { struct inode *tmp = nfs_delegation_grab_inode(prev); @@ -657,12 +660,6 @@ restart: } } - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) { - rcu_read_unlock(); - iput(to_put); - goto restart; - } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); @@ -1184,7 +1181,6 @@ static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server, struct inode *inode; restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1195,7 +1191,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); if (delegation != NULL) { @@ -1318,7 +1314,6 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server, restart: rcu_read_lock(); -restart_locked: list_for_each_entry_rcu(delegation, &server->delegations, super_list) { if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags) || @@ -1330,7 +1325,7 @@ restart_locked: continue; inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) - goto restart_locked; + continue; spin_lock(&delegation->lock); cred = get_cred_rcu(delegation->cred); nfs4_stateid_copy(&stateid, &delegation->stateid); -- cgit From 996b37da1e0f51314d4186b326742c2a95a9f0dd Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Mon, 8 Jul 2024 15:22:06 +0800 Subject: backing-file: convert to using fops->splice_write Filesystems may define their own splice write. Therefore, use the file fops instead of invoking iter_file_splice_write() directly. 
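A small standalone sketch of the dispatch pattern (illustrative types, not the kernel API): call through the per-file operation table, with an explicit check because the operation is optional.

    #include <stddef.h>
    #include <errno.h>

    struct file;

    struct file_ops {
        long (*splice_write)(struct file *out, const void *buf, size_t len);
    };

    struct file {
        const struct file_ops *f_op;
    };

    static long do_splice_write(struct file *out, const void *buf, size_t len)
    {
        if (!out->f_op->splice_write)
            return -EINVAL;     /* the op is optional; don't assume a default */
        return out->f_op->splice_write(out, buf, len);
    }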
Signed-off-by: Ed Tsai Link: https://lore.kernel.org/r/20240708072208.25244-1-ed.tsai@mediatek.com Fixes: 5ca73468612d ("fuse: implement splice read/write passthrough") Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/backing-file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/backing-file.c b/fs/backing-file.c index afb557446c27..8860dac58c37 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -303,13 +303,16 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; + if (!out->f_op->splice_write) + return -EINVAL; + ret = file_remove_privs(ctx->user_file); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); - ret = iter_file_splice_write(pipe, out, ppos, len, flags); + ret = out->f_op->splice_write(pipe, out, ppos, len, flags); file_end_write(out); revert_creds(old_cred); -- cgit From a204501e1743d695ca2930ed25a2be9f8ced96d3 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 23 Aug 2024 11:51:08 -0400 Subject: nfsd: prevent panic for nfsv4.0 closed files in nfs4_show_open Prior to commit 3f29cc82a84c ("nfsd: split sc_status out of sc_type"), states_show() relied on the sc_type field being of a valid type before calling into a subfunction to show the content of a particular stateid. From that commit, we split the validity of the stateid into sc_status and no longer changed sc_type to 0 while unhashing the stateid. This resulted in a kernel oops for nfsv4.0 opens that stay around, where nfs4_show_open() would dereference sc_file, which was NULL. Instead, for closed open stateids, forgo displaying information that relies on having a valid sc_file. To reproduce: mount the server with 4.0, read and close a file and then on the server cat /proc/fs/nfsd/clients/2/states [ 513.590804] Call trace: [ 513.590925] _raw_spin_lock+0xcc/0x160 [ 513.591119] nfs4_show_open+0x78/0x2c0 [nfsd] [ 513.591412] states_show+0x44c/0x488 [nfsd] [ 513.591681] seq_read_iter+0x5d8/0x760 [ 513.591896] seq_read+0x188/0x208 [ 513.592075] vfs_read+0x148/0x470 [ 513.592241] ksys_read+0xcc/0x178 Fixes: 3f29cc82a84c ("nfsd: split sc_status out of sc_type") Signed-off-by: Olga Kornievskaia Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a20c2c9d7d45..dafff707e23a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2789,15 +2789,18 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ?
"w" : "-"); - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (file) { - nfs4_show_superblock(s, file); - seq_puts(s, ", "); - nfs4_show_fname(s, file); - seq_puts(s, ", "); - } - spin_unlock(&nf->fi_lock); + if (nf) { + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } + spin_unlock(&nf->fi_lock); + } else + seq_puts(s, "closed, "); nfs4_show_owner(s, oo); if (st->sc_status & SC_STATUS_ADMIN_REVOKED) seq_puts(s, ", admin-revoked"); -- cgit From 7eff3453cbd7e0bfc7524d59694119b5ca844778 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 5 Jul 2024 09:15:08 +0800 Subject: ovl: pass string to ovl_parse_layer() So it can be used for parsing the Opt_lowerdir. Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-2-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 4860fcc4611b..52e3860973b7 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -365,10 +365,9 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, } } -static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param, - enum ovl_opt layer) +static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer) { - char *name = kstrdup(param->string, GFP_KERNEL); + char *name = kstrdup(layer_name, GFP_KERNEL); bool upper = (layer == Opt_upperdir || layer == Opt_workdir); struct path path; int err; @@ -582,7 +581,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_datadir_add: case Opt_upperdir: case Opt_workdir: - err = ovl_parse_layer(fc, param, opt); + err = ovl_parse_layer(fc, param->string, opt); break; case Opt_default_permissions: config->default_permissions = true; -- cgit From ca76ac36bb6068866feca185045e7edf2a8f392f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:09 +0800 Subject: ovl: fix wrong lowerdir number check for parameter Opt_lowerdir The max count of lowerdir is OVL_MAX_STACK[500], which is broken by commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") for parameter Opt_lowerdir. Since commit 819829f0319a("ovl: refactor layer parsing helpers") and commit 24e16e385f22("ovl: add support for appending lowerdirs one by one") added check ovl_mount_dir_check() in function ovl_parse_param_lowerdir(), the 'ctx->nr' should be smaller than OVL_MAX_STACK, after commit 37f32f526438("ovl: fix memory leak in ovl_parse_param()") is applied, the 'ctx->nr' is updated before the check ovl_mount_dir_check(), which leads the max count of lowerdir to become 499 for parameter Opt_lowerdir. Fix it by replacing lower layers parsing code with the existing helper function ovl_parse_layer(). 
Fixes: 37f32f526438 ("ovl: fix memory leak in ovl_parse_param()") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-3-chengzhihao1@huawei.com Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 52e3860973b7..8dd834c7f291 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -353,6 +353,8 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer, case Opt_datadir_add: ctx->nr_data++; fallthrough; + case Opt_lowerdir: + fallthrough; case Opt_lowerdir_add: WARN_ON(ctx->nr >= ctx->capacity); l = &ctx->lower[ctx->nr++]; @@ -375,7 +377,7 @@ static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum o if (!name) return -ENOMEM; - if (upper) + if (upper || layer == Opt_lowerdir) err = ovl_mount_dir(name, &path); else err = ovl_mount_dir_noesc(name, &path); @@ -431,7 +433,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { int err; struct ovl_fs_context *ctx = fc->fs_private; - struct ovl_fs_context_layer *l; char *dup = NULL, *iter; ssize_t nr_lower, nr; bool data_layer = false; @@ -471,35 +472,11 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) goto out_err; } - if (nr_lower > ctx->capacity) { - err = -ENOMEM; - l = krealloc_array(ctx->lower, nr_lower, sizeof(*ctx->lower), - GFP_KERNEL_ACCOUNT); - if (!l) - goto out_err; - - ctx->lower = l; - ctx->capacity = nr_lower; - } - iter = dup; - l = ctx->lower; - for (nr = 0; nr < nr_lower; nr++, l++) { - ctx->nr++; - memset(l, 0, sizeof(*l)); - - err = ovl_mount_dir(iter, &l->path); + for (nr = 0; nr < nr_lower; nr++) { + err = ovl_parse_layer(fc, iter, Opt_lowerdir); if (err) - goto out_put; - - err = ovl_mount_dir_check(fc, &l->path, Opt_lowerdir, iter, false); - if (err) - goto out_put; - - err = -ENOMEM; - l->name = kstrdup(iter, GFP_KERNEL_ACCOUNT); - if (!l->name) - goto out_put; + goto out_err; if (data_layer) ctx->nr_data++; @@ -517,7 +494,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) */ if (ctx->nr_data > 0) { pr_err("regular lower layers cannot follow data lower layers"); - goto out_put; + goto out_err; } data_layer = false; @@ -531,9 +508,6 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) kfree(dup); return 0; -out_put: - ovl_reset_lowerdirs(ctx); - out_err: kfree(dup); -- cgit From 441e36ef5b347d9ab4f54f7b54853266be687556 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Fri, 5 Jul 2024 09:15:10 +0800 Subject: ovl: ovl_parse_param_lowerdir: Add missed '\n' for pr_err Add '\n' for pr_err in function ovl_parse_param_lowerdir(), which ensures that error message is displayed at once. 
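A userspace analogue of the symptom (stdio line buffering rather than printk's flushing rules, so only an approximation): without the trailing newline, the message can sit in a buffer instead of appearing immediately.

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        printf("cannot append lower layer");    /* may stay buffered on a tty */
        sleep(2);                               /* nothing is visible yet */
        printf("\n");                           /* the newline completes the line */
        return 0;
    }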
Fixes: b36a5780cb44 ("ovl: modify layer parameter parsing") Signed-off-by: Zhihao Cheng Link: https://lore.kernel.org/r/20240705011510.794025-4-chengzhihao1@huawei.com Signed-off-by: Christian Brauner --- fs/overlayfs/params.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 8dd834c7f291..d0568c091341 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -449,7 +449,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; if (*name == ':') { - pr_err("cannot append lower layer"); + pr_err("cannot append lower layer\n"); return -EINVAL; } @@ -493,7 +493,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) * there are no data layers. */ if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); + pr_err("regular lower layers cannot follow data lower layers\n"); goto out_err; } -- cgit From a74ee0e878e262c0276966528d72d4e887174410 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:10 +0100 Subject: afs: Fix post-setattr file edit to do truncation correctly At the end of a kAFS RPC operation, there is an "edit" phase (originally intended for post-directory modification ops to edit the local image) that the setattr VFS op uses to fix up the pagecache if the RPC that requested truncation of a file was successful. afs_setattr_edit_file() calls truncate_setsize() which sets i_size, expands the pagecache if needed and truncates the pagecache. The first two of those, however, are redundant as they've already been done by afs_setattr_success() under the io_lock and the first is also done under the callback lock (cb_lock). Fix afs_setattr_edit_file() to call truncate_pagecache() instead (which is called by truncate_setsize()), thereby skipping the redundant parts. Fixes: 100ccd18bb41 ("netfs: Optimise away reads above the point at which there can be no data") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-3-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/afs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3acf5e050072..a95e77670b49 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -695,13 +695,18 @@ static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode *vnode = vp->vnode; + struct inode *inode = &vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; - loff_t i_size = op->setattr.old_i_size; + loff_t old = op->setattr.old_i_size; + + /* Note: inode->i_size was updated by afs_apply_status() inside + * the I/O and callback locks. + */ - if (size != i_size) { - truncate_setsize(&vnode->netfs.inode, size); + if (size != old) { + truncate_pagecache(inode, size); netfs_resize_file(&vnode->netfs, size, true); fscache_resize_cookie(afs_vnode_cache(vnode), size); } -- cgit From 7dfc8f0c6144c290dbeb01835a67e81b34dda8cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:11 +0100 Subject: netfs: Fix netfs_release_folio() to say no if folio dirty Fix netfs_release_folio() to say no (ie.
return false) if the folio is dirty (analogous with iomap's behaviour). Without this, it will say yes to the release of a dirty page by split_huge_page_to_list_to_order(), which will result in the loss of untruncated data in the folio. Without this, the generic/075 and generic/112 xfstests (both fsx-based tests) fail with minimum folio size patches applied[1]. Fixes: c1ec4d7c2e13 ("netfs: Provide invalidate_folio and release_folio calls") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240815090849.972355-1-kernel@pankajraghav.com/ [1] Link: https://lore.kernel.org/r/20240823200819.532106-4-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 554a1a4615ad..69324761fcf7 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -161,6 +161,9 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); unsigned long long end; + if (folio_test_dirty(folio)) + return false; + end = folio_pos(folio) + folio_size(folio); if (end > ctx->zero_point) ctx->zero_point = end; -- cgit From cce6bfa6ca0e30af9927b0074c97fe6a92f28092 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:12 +0100 Subject: netfs: Fix trimming of streaming-write folios in netfs_inval_folio() When netfslib writes to a folio that it doesn't have data for, but that data exists on the server, it will make a 'streaming write' whereby it stores data in a folio that is marked dirty, but not uptodate. When it does this, it attaches a record to folio->private to track the dirty region. When truncate() or fallocate() wants to invalidate part of such a folio, it will call into ->invalidate_folio(), specifying the part of the folio that is to be invalidated. netfs_invalidate_folio(), on behalf of the filesystem, must then determine how to trim the streaming write record. In a couple of cases, however, it does this incorrectly (the reduce-length and move-start cases are switched over and don't, in any case, calculate the value correctly). Fix this by making the logic tree more obvious and fixing the cases. 
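The corrected decision tree amounts to classic interval trimming, shown here as a standalone sketch (plain C, illustrative names; the kernel works on offsets within one folio): the invalidation either misses the dirty range, erases it, moves its start, or reduces its length.

    /* Dirty bytes occupy [start, end); inv is the invalidated range. */
    struct range {
        unsigned int start, end;
    };

    /* Returns 1 if the dirty range is wholly invalidated, else trims *d in place. */
    static int trim_dirty(struct range *d, struct range inv)
    {
        if (inv.start >= d->end || inv.end <= d->start)
            return 0;               /* no overlap: keep as-is */
        if (inv.start <= d->start && inv.end >= d->end)
            return 1;               /* erase completely */
        if (inv.start <= d->start)
            d->start = inv.end;     /* move the start forward */
        else if (inv.end >= d->end)
            d->end = inv.start;     /* reduce the length */
        /* else: an interior hole, which the caller has already zeroed */
        return 0;
    }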
Fixes: 9ebff83e6481 ("netfs: Prep to use folio->private for write grouping and streaming write") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-5-dhowells@redhat.com cc: Matthew Wilcox (Oracle) cc: Pankaj Raghav cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/misc.c | 50 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 69324761fcf7..c1f321cf5999 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -97,10 +97,20 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback); void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct netfs_folio *finfo; + struct netfs_inode *ctx = netfs_inode(folio_inode(folio)); size_t flen = folio_size(folio); _enter("{%lx},%zx,%zx", folio->index, offset, length); + if (offset == 0 && length == flen) { + unsigned long long i_size = i_size_read(&ctx->inode); + unsigned long long fpos = folio_pos(folio), end; + + end = umin(fpos + flen, i_size); + if (fpos < i_size && end > ctx->zero_point) + ctx->zero_point = end; + } + folio_wait_private_2(folio); /* [DEPRECATED] */ if (!folio_test_private(folio)) @@ -115,18 +125,34 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) /* We have a partially uptodate page from a streaming write. */ unsigned int fstart = finfo->dirty_offset; unsigned int fend = fstart + finfo->dirty_len; - unsigned int end = offset + length; + unsigned int iend = offset + length; if (offset >= fend) return; - if (end <= fstart) + if (iend <= fstart) + return; + + /* The invalidation region overlaps the data. If the region + * covers the start of the data, we either move along the start + * or just erase the data entirely. + */ + if (offset <= fstart) { + if (iend >= fend) + goto erase_completely; + /* Move the start of the data. */ + finfo->dirty_len = fend - iend; + finfo->dirty_offset = offset; + return; + } + + /* Reduce the length of the data if the invalidation region + * covers the tail part. + */ + if (iend >= fend) { + finfo->dirty_len = offset - fstart; return; - if (offset <= fstart && end >= fend) - goto erase_completely; - if (offset <= fstart && end > fstart) - goto reduce_len; - if (offset > fstart && end >= fend) - goto move_start; + } + /* A partial write was split. The caller has already zeroed * it, so just absorb the hole. */ @@ -139,12 +165,6 @@ erase_completely: folio_clear_uptodate(folio); kfree(finfo); return; -reduce_len: - finfo->dirty_len = offset + length - finfo->dirty_offset; - return; -move_start: - finfo->dirty_len -= offset - finfo->dirty_offset; - finfo->dirty_offset = offset; } EXPORT_SYMBOL(netfs_invalidate_folio); @@ -164,7 +184,7 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_dirty(folio)) return false; - end = folio_pos(folio) + folio_size(folio); + end = umin(folio_pos(folio) + folio_size(folio), i_size_read(&ctx->inode)); if (end > ctx->zero_point) ctx->zero_point = end; -- cgit From 950b03d0f664a54389a555d79215348ed413161f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 21:08:13 +0100 Subject: netfs: Fix missing iterator reset on retry of short read Fix netfs_rreq_perform_resubmissions() to reset before retrying a short read, otherwise the wrong part of the output buffer will be used. 
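A userspace analogue of the rule (pread(2) instead of an iov_iter, purely illustrative): every retry of a short read must re-derive its destination from the number of bytes already transferred, never reuse the cursor the failed attempt left behind.

    #include <unistd.h>

    /* Read exactly want bytes at file_off, retrying short reads. */
    static ssize_t read_full(int fd, char *buf, size_t want, off_t file_off)
    {
        size_t done = 0;    /* bytes transferred so far */

        while (done < want) {
            /* the destination is recomputed from done on every attempt */
            ssize_t n = pread(fd, buf + done, want - done, file_off + done);

            if (n <= 0)
                return n;
            done += n;
        }
        return done;
    }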
Fixes: 92b6cc5d1e7c ("netfs: Add iov_iters to (sub)requests to describe various buffers") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20240823200819.532106-6-dhowells@redhat.com cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 5367caf3fa28..4da0a494e860 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -313,6 +313,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } } -- cgit From e00e99ba6c6b8e5239e75cd6684a6827d93c39a2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 24 Aug 2024 12:56:53 +0100 Subject: netfs: Fix interaction of streaming writes with zero-point tracker When a folio that is marked for streaming write (dirty, but not uptodate, with partial content specified in the private data) is written back, the folio is effectively switched to the blank state upon completion of the write. This means that if we want to read it in future, we need to reread the whole folio. However, if the folio is above the zero_point position, when it is read back, it will just be cleared and the read skipped, leading to apparent local corruption. Fix this by increasing the zero_point to the end of the dirty data in the folio when clearing the folio state after writeback. This is analogous to the folio having ->release_folio() called upon it. This was causing the config.log generated by configuring a cpython tree on a cifs share to get corrupted because the scripts involved were appending text to the file in small pieces. Fixes: 288ace2f57c9 ("netfs: New writeback implementation") Signed-off-by: David Howells Link: https://lore.kernel.org/r/563286.1724500613@warthog.procyon.org.uk cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/write_collect.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 426cf87aaf2e..ae7a2043f670 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -33,6 +33,7 @@ int netfs_folio_written_back(struct folio *folio) { enum netfs_folio_trace why = netfs_folio_trace_clear; + struct netfs_inode *ictx = netfs_inode(folio->mapping->host); struct netfs_folio *finfo; struct netfs_group *group = NULL; int gcount = 0; @@ -41,6 +42,12 @@ int netfs_folio_written_back(struct folio *folio) /* Streaming writes cannot be redirtied whilst under writeback, * so discard the streaming record. */ + unsigned long long fend; + + fend = folio_pos(folio) + finfo->dirty_offset + finfo->dirty_len; + if (fend > ictx->zero_point) + ictx->zero_point = fend; + folio_detach_private(folio); group = finfo->netfs_group; gcount++; -- cgit From d3204616a67e53fdcad14c7026869330fb382fd4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Aug 2024 17:38:41 -0400 Subject: bcachefs: Fix failure to flush moves before sleeping in copygc This fixes an apparent deadlock - rebalance would get stuck trying to take nocow locks because they weren't being released by copygc. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index deef4f024d20..d86565bf07c8 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -383,7 +383,7 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - bch2_trans_unlock_long(ctxt.trans); + move_buckets_wait(&ctxt, buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } -- cgit From 49aa7830396bce33b00fa7ee734c35de36521138 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 23 Aug 2024 15:35:22 -0400 Subject: bcachefs: Fix rebalance_work accounting rebalance_work was keying off of the presence of rebalance_opts in the extent - but that was incorrect: we keep those around after rebalance for indirect extents, since the inode's options are not directly available. Fixes: 20ac515a9cc7 ("bcachefs: bch_acct_rebalance_work") Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/buckets.c | 74 ++++++++++++++++++++++++++++--------------- fs/bcachefs/extents.c | 39 +++++++++++++++++++++++ fs/bcachefs/extents.h | 1 + fs/bcachefs/sb-downgrade.c | 8 ++++- 5 files changed, 98 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c75f2e0f32bb..14ce726bf5a3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -677,7 +677,8 @@ struct bch_sb_field_ext { x(bucket_stripe_sectors, BCH_VERSION(1, 8)) \ x(disk_accounting_v2, BCH_VERSION(1, 9)) \ x(disk_accounting_v3, BCH_VERSION(1, 10)) \ - x(disk_accounting_inum, BCH_VERSION(1, 11)) + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index be2bbd248631..a2274429e7f4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -699,7 +699,8 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags) + enum btree_iter_update_trigger_flags flags, + s64 *replicas_sectors) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -708,7 +709,6 @@ static int __trigger_extent(struct btree_trans *trans, enum bch_data_type data_type = bkey_is_btree_ptr(k.k) ?
BCH_DATA_btree : BCH_DATA_user; - s64 replicas_sectors = 0; int ret = 0; struct disk_accounting_pos acc_replicas_key = { @@ -739,7 +739,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - replicas_sectors += disk_sectors; + *replicas_sectors += disk_sectors; acc_replicas_key.replicas.devs[acc_replicas_key.replicas.nr_devs++] = p.ptr.dev; } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -777,7 +777,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -787,7 +787,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_snapshot, .snapshot.id = k.k->p.snapshot, }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); if (ret) return ret; } @@ -807,7 +807,7 @@ static int __trigger_extent(struct btree_trans *trans, .type = BCH_DISK_ACCOUNTING_btree, .btree.id = btree_id, }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, &replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); if (ret) return ret; } else { @@ -819,22 +819,13 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), - replicas_sectors, + *replicas_sectors, }; ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); if (ret) return ret; } - if (bch2_bkey_rebalance_opts(k)) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - ret = bch2_disk_accounting_mod(trans, &acc, &replicas_sectors, 1, gc); - if (ret) - return ret; - } - return 0; } @@ -843,6 +834,7 @@ int bch2_trigger_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s new, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c); struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old); unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start; @@ -858,21 +850,53 @@ int bch2_trigger_extent(struct btree_trans *trans, new_ptrs_bytes)) return 0; - if (flags & BTREE_TRIGGER_transactional) { - struct bch_fs *c = trans->c; - int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) - - (int) bch2_bkey_needs_rebalance(c, old); + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { + s64 old_replicas_sectors = 0, new_replicas_sectors = 0; + + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, + flags & ~BTREE_TRIGGER_insert, + &old_replicas_sectors); + if (ret) + return ret; + } + + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, + flags & ~BTREE_TRIGGER_overwrite, + &new_replicas_sectors); + if (ret) + return ret; + } + + int need_rebalance_delta = 0; + s64 need_rebalance_sectors_delta = 0; + + s64 s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta -= s != 0; + need_rebalance_sectors_delta -= s; - if (mod) { + s = bch2_bkey_sectors_need_rebalance(c, old); + need_rebalance_delta += s != 0; + need_rebalance_sectors_delta += s; + + if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, - new.k->p, mod > 0); + new.k->p, need_rebalance_delta > 0); if (ret) return ret; } - } - if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) - return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree, level, old, new, flags); + if (need_rebalance_sectors_delta) { + struct disk_accounting_pos acc = { + .type = BCH_DISK_ACCOUNTING_rebalance_work, + }; + int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, + flags & BTREE_TRIGGER_gc); + if (ret) + return ret; + } + } return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 9406f82fc255..e317df3644a1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1379,6 +1379,45 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) return r != NULL; } +static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, + unsigned target, unsigned compression) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (compression) { + unsigned compression_type = bch2_compression_opt_to_type(compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); + + return r ? 
__bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 1a6ddee48041..709dd83183be 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -692,6 +692,7 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, struct bch_io_opts *); diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 650a1f77ca40..c7e4cdd3f6a5 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -74,6 +74,9 @@ BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ x(disk_accounting_inum, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) \ + x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ BCH_FSCK_ERR_accounting_mismatch) @@ -108,7 +111,10 @@ BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \ BCH_FSCK_ERR_fs_usage_replicas_wrong, \ BCH_FSCK_ERR_accounting_replicas_not_marked, \ - BCH_FSCK_ERR_bkey_version_in_future) + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch) struct upgrade_downgrade_entry { u64 recovery_passes; -- cgit From c724b2ab6a46435b4e7d58ad2fbbdb7a318823cf Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 17:18:23 +0200 Subject: smb/client: avoid dereferencing rdata=NULL in smb2_new_read_req() This happens when called from SMB2_read() while using rdma and reaching the rdma_readwrite_threshold. 
Cc: stable@vger.kernel.org Fixes: a6559cc1d35d ("cifs: split out smb3_use_rdma_offload() helper") Reviewed-by: David Howells Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 83facb54276a..8901de199a6b 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4441,7 +4441,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * If we want to do a RDMA write, fill in and append * smbd_buffer_descriptor_v1 to the end of read request */ - if (smb3_use_rdma_offload(io_parms)) { + if (rdata && smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; bool need_invalidate = server->dialect == SMB30_PROT_ID; -- cgit From b608e2c318789aeba49055747166e13bee57df4a Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 15:59:12 +0200 Subject: smb/client: remove unused rq_iter_size from struct smb_rqst Reviewed-by: David Howells Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 1 - fs/smb/client/cifssmb.c | 1 - fs/smb/client/smb2ops.c | 2 -- fs/smb/client/smb2pdu.c | 2 -- 4 files changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 7ebe80a25d04..f379b9dc93ba 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -254,7 +254,6 @@ struct cifs_open_info_data { struct smb_rqst { struct kvec *rq_iov; /* array of kvecs */ unsigned int rq_nvec; /* number of kvecs in array */ - size_t rq_iter_size; /* Amount of data in ->rq_iter */ struct iov_iter rq_iter; /* Data iterator */ struct xarray rq_buffer; /* Page buffer for encryption */ }; diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 595c4b673707..6dce70f17208 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1713,7 +1713,6 @@ cifs_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 2; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&wdata->subreq.io_iter); cifs_dbg(FYI, "async write at %llu %zu bytes\n", wdata->subreq.start, wdata->subreq.len); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 322cabc69c6f..ea298456d841 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4446,7 +4446,6 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, } iov_iter_xarray(&new->rq_iter, ITER_SOURCE, buffer, 0, size); - new->rq_iter_size = size; } } @@ -4492,7 +4491,6 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_nvec = 2; if (iter) { rqst.rq_iter = *iter; - rqst.rq_iter_size = iov_iter_count(iter); iter_size = iov_iter_count(iter); } diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 8901de199a6b..63a2541d4a05 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4523,7 +4523,6 @@ smb2_readv_callback(struct mid_q_entry *mid) if (rdata->got_bytes) { rqst.rq_iter = rdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rdata->subreq.io_iter); } WARN_ONCE(rdata->server != mid->server, @@ -4975,7 +4974,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rqst.rq_iov = iov; rqst.rq_nvec = 1; rqst.rq_iter = wdata->subreq.io_iter; - rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter); if (test_bit(NETFS_SREQ_RETRYING, 
&wdata->subreq.flags)) smb2_set_replay(server, &rqst); #ifdef CONFIG_CIFS_SMB_DIRECT -- cgit From 017d1701743657fbfaea74397727a9d2b81846b7 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 21 Aug 2024 16:31:39 +0200 Subject: smb/client: fix rdma usage in smb2_async_writev() rqst.rq_iter needs to be truncated otherwise we'll also send the bytes into the stream socket... This is the logic behind rqst.rq_npages = 0, which was removed in "cifs: Change the I/O paths to use an iterator rather than a page list" (d08089f649a0cfb2099c8551ac47eef0cc23fdf2). Cc: stable@vger.kernel.org Fixes: d08089f649a0 ("cifs: Change the I/O paths to use an iterator rather than a page list") Reviewed-by: David Howells Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smb2pdu.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 63a2541d4a05..2d7e6c42cf18 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4913,6 +4913,13 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) if (rc) goto out; + rqst.rq_iov = iov; + rqst.rq_iter = wdata->subreq.io_iter; + + rqst.rq_iov[0].iov_len = total_len - 1; + rqst.rq_iov[0].iov_base = (char *)req; + rqst.rq_nvec += 1; + if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; @@ -4924,6 +4931,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = SMB2_CHANNEL_NONE; + req->Length = cpu_to_le32(io_parms->length); req->Offset = cpu_to_le64(io_parms->offset); req->DataOffset = cpu_to_le16( offsetof(struct smb2_write_req, Buffer)); @@ -4943,7 +4951,6 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) */ if (smb3_use_rdma_offload(io_parms)) { struct smbd_buffer_descriptor_v1 *v1; - size_t data_size = iov_iter_count(&wdata->subreq.io_iter); bool need_invalidate = server->dialect == SMB30_PROT_ID; wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter, @@ -4952,9 +4959,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) rc = -EAGAIN; goto async_writev_out; } + /* For RDMA read, I/O size is in RemainingBytes not in Length */ + req->RemainingBytes = req->Length; req->Length = 0; req->DataOffset = 0; - req->RemainingBytes = cpu_to_le32(data_size); req->Channel = SMB2_CHANNEL_RDMA_V1_INVALIDATE; if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; @@ -4966,30 +4974,22 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) v1->offset = cpu_to_le64(wdata->mr->mr->iova); v1->token = cpu_to_le32(wdata->mr->mr->rkey); v1->length = cpu_to_le32(wdata->mr->mr->length); + + rqst.rq_iov[0].iov_len += sizeof(*v1); + + /* + * We keep wdata->subreq.io_iter, + * but we have to truncate rqst.rq_iter + */ + iov_iter_truncate(&rqst.rq_iter, 0); } #endif - iov[0].iov_len = total_len - 1; - iov[0].iov_base = (char *)req; - rqst.rq_iov = iov; - rqst.rq_nvec = 1; - rqst.rq_iter = wdata->subreq.io_iter; if (test_bit(NETFS_SREQ_RETRYING, &wdata->subreq.flags)) smb2_set_replay(server, &rqst); -#ifdef CONFIG_CIFS_SMB_DIRECT - if (wdata->mr) - iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); -#endif - cifs_dbg(FYI, "async write at %llu %u bytes iter=%zx\n", - io_parms->offset, io_parms->length, iov_iter_count(&rqst.rq_iter)); -#ifdef CONFIG_CIFS_SMB_DIRECT - /* For RDMA read, I/O size is in RemainingBytes not in Length */ - if (!wdata->mr) - req->Length = cpu_to_le32(io_parms->length); -#else - 
req->Length = cpu_to_le32(io_parms->length); -#endif if (wdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->subreq.len, -- cgit From 416871f4fb84bc96822562e654941d5625a25bf8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Aug 2024 14:22:42 +0100 Subject: cifs: Fix FALLOC_FL_PUNCH_HOLE support The cifs filesystem doesn't quite emulate FALLOC_FL_PUNCH_HOLE correctly (note that due to lack of protocol support, it can't actually implement it directly). Whilst it will (partially) invalidate dirty folios in the pagecache, it doesn't write them back first, and so the EOF marker on the server may be lower than inode->i_size. This presents a problem, however, as if the punched hole invalidates the tail of the locally cached dirty data, writeback won't know it needs to move the EOF over to account for the hole punch (which isn't supposed to move the EOF). We could just write zeroes over the punched out region of the pagecache and write that back - but this is supposed to be a deallocatory operation. Fix this by manually moving the EOF over on the server after the operation if the hole punched would corrupt it. Note that the FSCTL_SET_ZERO_DATA RPC and the setting of the EOF should probably be compounded to stop a third party interfering (or, at least, massively reduce the chance). This was reproducible occasionally by using fsx with the following script: truncate 0x0 0x375e2 0x0 punch_hole 0x2f6d3 0x6ab5 0x375e2 truncate 0x0 0x3a71f 0x375e2 mapread 0xee05 0xcf12 0x3a71f write 0x2078e 0x5604 0x3a71f write 0x3ebdf 0x1421 0x3a71f * punch_hole 0x379d0 0x8630 0x40000 * mapread 0x2aaa2 0x85b 0x40000 fallocate 0x1b401 0x9ada 0x40000 read 0x15f2 0x7d32 0x40000 read 0x32f37 0x7a3b 0x40000 * The second "write" should extend the EOF to 0x40000, and the "punch_hole" should operate inside of that - but that depends on whether the VM gets in and writes back the data first. If it doesn't, the file ends up 0x3a71f in size, not 0x40000. Fixes: 31742c5a3317 ("enable fallocate punch hole ("fallocate -p") for SMB3") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Shyam Prasad N cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index ea298456d841..0b9cb1a60d4a 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3305,6 +3305,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, struct inode *inode = file_inode(file); struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + unsigned long long end = offset + len, i_size, remote_i_size; long rc; unsigned int xid; __u8 set_sparse = 1; @@ -3336,6 +3337,27 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, (char *)&fsctl_buf, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); + + if (rc) + goto unlock; + + /* If there's dirty data in the buffer that would extend the EOF if it + * were written, then we need to move the EOF marker over to the lower + * of the high end of the hole and the proposed EOF. The problem is + * that if we locally hole-punch the tail of the dirty data, the proposed + * EOF update will end up in the wrong place.
+ */ + i_size = i_size_read(inode); + remote_i_size = netfs_inode(inode)->remote_i_size; + if (end > remote_i_size && i_size > remote_i_size) { + unsigned long long extend_to = umin(end, i_size); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, extend_to); + if (rc >= 0) + netfs_inode(inode)->remote_i_size = extend_to; + } + +unlock: filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); -- cgit From 2d3447261031503b181dacc549fe65ffe2d93d65 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 21 Aug 2024 15:53:18 -0400 Subject: btrfs: run delayed iputs when flushing delalloc We have transient failures with btrfs/301, specifically in the part where we do for i in $(seq 0 10); do write 50m to file rm -f file done Sometimes this will result in a transient quota error, and it's because sometimes we start writeback on the file which results in a delayed iput, and thus the rm doesn't actually clean the file up. When we're flushing the quota space we need to run the delayed iputs to make sure all the unlinks that we think have completed have actually completed. This removes the small window where we could fail to find enough space in our quota. CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5d57a285d59b..7d6f5d9420ec 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4185,6 +4185,8 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } + btrfs_run_delayed_iputs(root->fs_info); + btrfs_wait_on_delayed_iputs(root->fs_info); ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; -- cgit From e21fea4ac3cf12eba1921fbbf7764bf69c6d4b2c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:01 -0700 Subject: xfs: fix di_onlink checking for V1/V2 inodes "KjellR" complained on IRC that an old V4 filesystem suddenly stopped mounting after upgrading from 6.9.11 to 6.10.3, with the following splat when trying to read the rt bitmap inode: 00000000: 49 4e 80 00 01 02 00 01 00 00 00 00 00 00 00 00 IN.............. 00000010: 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000020: 00 00 00 00 00 00 00 00 43 d2 a9 da 21 0f d6 30 ........C...!..0 00000030: 43 d2 a9 da 21 0f d6 30 00 00 00 00 00 00 00 00 C...!..0........ 00000040: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000050: 00 00 00 02 00 00 00 00 00 00 00 04 00 00 00 00 ................ 00000060: ff ff ff ff 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ As Dave Chinner points out, this is a V1 inode with both di_onlink and di_nlink set to 1 and di_flushiter == 0. In other words, this inode was formatted this way by mkfs and hasn't been touched since then. Back in the old days of xfsprogs 3.2.3, I observed that libxfs_ialloc would set di_nlink, but if the filesystem didn't have NLINK, it would then set di_version = 1. libxfs_iflush_int later sees the V1 inode and copies the value of di_nlink to di_onlink without zeroing di_onlink. Eventually this filesystem must have been upgraded to support NLINK because 6.10 doesn't support !NLINK filesystems, which is how we tripped over this old behavior. The filesystem doesn't have a realtime section, so that's why the rtbitmap inode has never been touched. 
Fix this by removing the di_onlink/di_nlink checking for all V1/V2 inodes because this is a muddy mess. The V3 inode handling code has always supported NLINK and written di_onlink==0 so keep that check. The removal of the V1 inode handling code when we dropped support for !NLINK obscured this old behavior. Reported-by: kjell.m.randa@gmail.com Fixes: 40cb8613d612 ("xfs: check unused nlink fields in the ondisk inode") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_inode_buf.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 513b50da6215..79babeac9d75 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -514,12 +514,18 @@ xfs_dinode_verify( return __this_address; } - if (dip->di_version > 1) { + /* + * Historical note: xfsprogs in the 3.2 era set up its incore inodes to + * have di_nlink track the link count, even if the actual filesystem + * only supported V1 inodes (i.e. di_onlink). When writing out the + * ondisk inode, it would set both the ondisk di_nlink and di_onlink to + * the incore di_nlink value, which is why we cannot check for + * di_nlink==0 on a V1 inode. V2/3 inodes would get written out with + * di_onlink==0, so we can check that. + */ + if (dip->di_version >= 2) { if (dip->di_onlink) return __this_address; - } else { - if (dip->di_nlink) - return __this_address; } /* don't allow invalid i_size */ -- cgit From 5335affcff91b53cfc45694171f911cb23257c8b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:17 -0700 Subject: xfs: fix folio dirtying for XFILE_ALLOC callers willy pointed out that folio_mark_dirty is the correct function to use to mark an xfile folio dirty because it calls out to the mapping's aops to mark it dirty. For tmpfs this likely doesn't matter much since it currently uses nop_dirty_folio, but let's use the abstractions properly. Reported-by: willy@infradead.org Fixes: 6907e3c00a40 ("xfs: add file_{get,put}_folio") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d848222f802b..9b5d98fe1f8a 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -293,7 +293,7 @@ xfile_get_folio( * (potentially last) reference in xfile_put_folio. */ if (flags & XFILE_ALLOC) - folio_set_dirty(folio); + folio_mark_dirty(folio); return folio; } -- cgit From 95179935beadccaf0f0bb461adb778731e293da4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 22 Aug 2024 16:59:33 -0700 Subject: xfs: xfs_finobt_count_blocks() walks the wrong btree As a result of the factoring in commit 14dd46cf31f4 ("xfs: split xfs_inobt_init_cursor"), mount started taking a long time on a user's filesystem. For Anders, this made mount times regress from under a second to over 15 minutes for a filesystem with only 30 million inodes in it. Anders bisected it down to the above commit, but even then the bug was not obvious. In this commit, over 20 calls to xfs_inobt_init_cursor() were modified, and some were modified to call a new function named xfs_finobt_init_cursor(). If that takes you a moment to reread those function names to see what the rename was, then you have realised why this bug wasn't spotted during review.
And it wasn't spotted on inspection even after the bisect pointed at this commit - a single missing "f" isn't the easiest thing for a human eye to notice.... The result is that xfs_finobt_count_blocks() now incorrectly calls xfs_inobt_init_cursor() so it is now walking the inobt instead of the finobt. Hence when there are lots of allocated inodes in a filesystem, mount takes a -long- time to run because it now walks the massive allocated inode btrees instead of the small, nearly empty free inode btrees. It also means all the finobt space reservations are wrong, so mount could potentially give ENOSPC on kernel upgrade. In hindsight, commit 14dd46cf31f4 should have been two commits - the first to convert the finobt callers to the new API, the second to modify the xfs_inobt_init_cursor() API for the inobt callers. That would have made the bug very obvious during review. Fixes: 14dd46cf31f4 ("xfs: split xfs_inobt_init_cursor") Reported-by: Anders Blomdell Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 496e2f72a85b..797d5b5f7b72 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -749,7 +749,7 @@ xfs_finobt_count_blocks( if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); -- cgit From 410e8a18f8e9311c6bf29ae47f32ad46f0219569 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 16:59:48 -0700 Subject: xfs: don't bother reporting blocks trimmed via FITRIM Don't bother reporting the number of bytes that we "trimmed" because the underlying storage isn't required to do anything(!) and failed discard IOs aren't reported to the caller anyway. It's not like userspace can use the reported value for anything useful like adjusting the offset parameter of the next call, and it's not like anyone ever wrote a manpage about FITRIM's out parameters. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_discard.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 6f0fc7fe1f2b..25f5dffeab2a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -158,8 +158,7 @@ static int xfs_trim_gather_extents( struct xfs_perag *pag, struct xfs_trim_cur *tcur, - struct xfs_busy_extents *extents, - uint64_t *blocks_trimmed) + struct xfs_busy_extents *extents) { struct xfs_mount *mp = pag->pag_mount; struct xfs_trans *tp; @@ -280,7 +279,6 @@ xfs_trim_gather_extents( xfs_extent_busy_insert_discard(pag, fbno, flen, &extents->extent_list); - *blocks_trimmed += flen; next_extent: if (tcur->by_bno) error = xfs_btree_increment(cur, 0, &i); @@ -327,8 +325,7 @@ xfs_trim_perag_extents( struct xfs_perag *pag, xfs_agblock_t start, xfs_agblock_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { struct xfs_trim_cur tcur = { .start = start, @@ -354,8 +351,7 @@ xfs_trim_perag_extents( extents->owner = extents; INIT_LIST_HEAD(&extents->extent_list); - error = xfs_trim_gather_extents(pag, &tcur, extents, - blocks_trimmed); + error = xfs_trim_gather_extents(pag, &tcur, extents); if (error) { kfree(extents); break; @@ -389,8 +385,7 @@ xfs_trim_datadev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_extlen_t minlen, - uint64_t *blocks_trimmed) + xfs_extlen_t minlen) { xfs_agnumber_t start_agno, end_agno; xfs_agblock_t start_agbno, end_agbno; @@ -411,8 +406,7 @@ xfs_trim_datadev_extents( if (start_agno == end_agno) agend = end_agbno; - error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen, - blocks_trimmed); + error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); if (error) last_error = error; @@ -431,9 +425,6 @@ struct xfs_trim_rtdev { /* list of rt extents to free */ struct list_head extent_list; - /* pointer to count of blocks trimmed */ - uint64_t *blocks_trimmed; - /* minimum length that caller allows us to trim */ xfs_rtblock_t minlen_fsb; @@ -551,7 +542,6 @@ xfs_trim_gather_rtextent( busyp->length = rlen; INIT_LIST_HEAD(&busyp->list); list_add_tail(&busyp->list, &tr->extent_list); - *tr->blocks_trimmed += rlen; tr->restart_rtx = rec->ar_startext + rec->ar_extcount; return 0; @@ -562,13 +552,11 @@ xfs_trim_rtdev_extents( struct xfs_mount *mp, xfs_daddr_t start, xfs_daddr_t end, - xfs_daddr_t minlen, - uint64_t *blocks_trimmed) + xfs_daddr_t minlen) { struct xfs_rtalloc_rec low = { }; struct xfs_rtalloc_rec high = { }; struct xfs_trim_rtdev tr = { - .blocks_trimmed = blocks_trimmed, .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), }; struct xfs_trans *tp; @@ -634,7 +622,7 @@ xfs_trim_rtdev_extents( return error; } #else -# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP) +# define xfs_trim_rtdev_extents(...) 
(-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ /* @@ -661,7 +649,6 @@ xfs_ioc_trim( xfs_daddr_t start, end; xfs_extlen_t minlen; xfs_rfsblock_t max_blocks; - uint64_t blocks_trimmed = 0; int error, last_error = 0; if (!capable(CAP_SYS_ADMIN)) @@ -706,15 +693,13 @@ xfs_ioc_trim( end = start + BTOBBT(range.len) - 1; if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { - error = xfs_trim_datadev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_datadev_extents(mp, start, end, minlen); if (error) last_error = error; } if (rt_bdev && !xfs_trim_should_stop()) { - error = xfs_trim_rtdev_extents(mp, start, end, minlen, - &blocks_trimmed); + error = xfs_trim_rtdev_extents(mp, start, end, minlen); if (error) last_error = error; } @@ -722,7 +707,8 @@ xfs_ioc_trim( if (last_error) return last_error; - range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + range.len = min_t(unsigned long long, range.len, + XFS_FSB_TO_B(mp, max_blocks)); if (copy_to_user(urange, &range, sizeof(range))) return -EFAULT; return 0; -- cgit From 68415b349f3f16904f006275757f4fcb34b8ee43 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 22 Aug 2024 17:00:04 -0700 Subject: xfs: Fix the owner setting issue for rmap query in xfs fsmap I noticed an rmap query bug in xfs_io fsmap: [root@fedora ~]# xfs_io -c 'fsmap -vvvv' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 1: 253:16 [8..23]: per-AG metadata 0 (8..23) 16 2: 253:16 [24..39]: inode btree 0 (24..39) 16 3: 253:16 [40..47]: per-AG metadata 0 (40..47) 8 4: 253:16 [48..55]: refcount btree 0 (48..55) 8 5: 253:16 [56..103]: per-AG metadata 0 (56..103) 48 6: 253:16 [104..127]: free space 0 (104..127) 24 ...... Bug: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 0 3' /mnt [root@fedora ~]# Normally, we should be able to get one record, but we got nothing. The root cause of this problem lies in the incorrect setting of rm_owner in the rmap query. In the case of the initial query where the owner is not set, __xfs_getfsmap_datadev() first sets info->high.rm_owner to ULLONG_MAX. This is done to prevent any omissions when comparing rmap items. However, if the current ag is detected to be the last one, the function sets info's high_irec based on the provided key. If high->rm_owner is not specified, it should continue to be set to ULLONG_MAX; otherwise, there will be issues with interval omissions. For example, consider "start" and "end" within the same block. If high->rm_owner == 0, it will be smaller than the found record in the rmapbt, resulting in a query with no records. The main call stack is as follows: xfs_ioc_getfsmap xfs_getfsmap xfs_getfsmap_datadev_rmapbt __xfs_getfsmap_datadev info->high.rm_owner = ULLONG_MAX if (pag->pag_agno == end_ag) xfs_fsmap_owner_to_rmap // set info->high.rm_owner = 0 because fmr_owner == -1ULL dest->rm_owner = 0 // get nothing xfs_getfsmap_datadev_rmapbt_query The problem can be resolved by simply modifying the internal logic of the xfs_fsmap_owner_to_rmap() function. After applying this patch, the above problem has been solved: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 0 3' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J.
Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 85dbb46452ca..3a30b36779db 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -71,7 +71,7 @@ xfs_fsmap_owner_to_rmap( switch (src->fmr_owner) { case 0: /* "lowest owner id possible" */ case -1ULL: /* "highest owner id possible" */ - dest->rm_owner = 0; + dest->rm_owner = src->fmr_owner; break; case XFS_FMR_OWN_FREE: dest->rm_owner = XFS_RMAP_OWN_NULL; -- cgit From 33f58a0480bb9e2479ccdf556f61363723a50d47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 21 Aug 2024 01:19:57 +0200 Subject: btrfs: initialize last_extent_end to fix -Wmaybe-uninitialized warning in extent_fiemap() There's a warning (probably on some older compiler version): fs/btrfs/fiemap.c: warning: 'last_extent_end' may be used uninitialized in this function [-Wmaybe-uninitialized]: => 822:19 Initialize the variable to 0 although it's not necessary as it's either properly set or not used after an error. The called function is in the same file so this is a false alert but we want to fix all -Wmaybe-uninitialized reports. Link: https://lore.kernel.org/all/20240819070639.2558629-1-geert@linux-m68k.org/ Reported-by: Geert Uytterhoeven Signed-off-by: David Sterba --- fs/btrfs/fiemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index 8f95f3e44e99..df7f09f3b02e 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -637,7 +637,7 @@ static int extent_fiemap(struct btrfs_inode *inode, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - u64 last_extent_end; + u64 last_extent_end = 0; u64 prev_extent_end; u64 range_start; u64 range_end; -- cgit From da05ba23d4c8d3e8a45846b952e53dd76c4b5e36 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 23 Aug 2024 18:27:38 -0400 Subject: nfsd: hold reference to delegation when updating it for cb_getattr Once we've dropped the flc_lock, there is nothing that ensures that the delegation that was found will still be around later. Take a reference to it while holding the lock and then drop it when we've finished with the delegation. 
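The shape of this fix is the standard take-a-reference-before-unlock pattern. As a hedged sketch (the helper name is made up for illustration; the fields and calls match the patch below):

static struct nfs4_delegation *
deleg_find_and_pin(struct file_lock_context *ctx, struct file_lease *fl)
{
	struct nfs4_delegation *dp;

	spin_lock(&ctx->flc_lock);
	dp = fl->c.flc_owner;
	/* pin the delegation while still holding flc_lock ... */
	refcount_inc(&dp->dl_stid.sc_count);
	spin_unlock(&ctx->flc_lock);
	/* ... and the caller drops it with nfs4_put_stid() when done */
	return dp;
}
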
Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index dafff707e23a..19d39872be32 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8837,7 +8837,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lease *fl; - struct nfs4_delegation *dp; struct iattr attrs; struct nfs4_cb_fattr *ncf; @@ -8862,7 +8861,8 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, goto break_lease; } if (type == F_WRLCK) { - dp = fl->c.flc_owner; + struct nfs4_delegation *dp = fl->c.flc_owner; + if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) { spin_unlock(&ctx->flc_lock); return 0; @@ -8870,6 +8870,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; + refcount_inc(&dp->dl_stid.sc_count); ncf = &dp->dl_cb_fattr; nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); @@ -8879,8 +8880,10 @@ break_lease: /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) + !nfsd_wait_for_delegreturn(rqstp, inode)) { + nfs4_put_stid(&dp->dl_stid); return status; + } } if (!ncf->ncf_file_modified && (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || @@ -8900,6 +8903,7 @@ break_lease: *size = ncf->ncf_cur_fsize; *modified = true; } + nfs4_put_stid(&dp->dl_stid); return 0; } break; -- cgit From 1116e0e372eb16dd907ec571ce5d4af325c55c10 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 23 Aug 2024 18:27:39 -0400 Subject: nfsd: fix potential UAF in nfsd4_cb_getattr_release Once we drop the delegation reference, the fields embedded in it are no longer safe to access. Do that last. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 19d39872be32..02d43f95146e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3078,9 +3078,9 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - nfs4_put_stid(&dp->dl_stid); clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); + nfs4_put_stid(&dp->dl_stid); } static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { -- cgit From c6a09e342f8e6d3cac7f7c5c14085236aca284b9 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 25 Aug 2024 20:27:45 -0700 Subject: binfmt_elf_fdpic: fix AUXV size calculation when ELF_HWCAP2 is defined create_elf_fdpic_tables() does not correctly account the space for the AUX vector when an architecture has ELF_HWCAP2 defined. Prior to the commit 10e29251be0e ("binfmt_elf_fdpic: fix /proc//auxv") it resulted in the last entry of the AUX vector being set to zero, but with that change it results in a kernel BUG. Fix that by adding one to the number of AUXV entries (nitems) when ELF_HWCAP2 is defined. 
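As a sketch of the arithmetic (simplified from create_elf_fdpic_tables(); the counting of platform strings and other fixed entries is elided here): each AUXV entry is an (a_type, a_val) pair of unsigned longs, so the stack reservation is nitems * 2 * sizeof(unsigned long), and every entry that will later be written -- AT_HWCAP2 included -- must be counted in nitems first:

	nitems = 1 + DLINFO_ITEMS;	/* AT_NULL terminator + fixed entries */
	if (bprm->have_execfd)
		nitems++;		/* AT_EXECFD */
#ifdef ELF_HWCAP2
	nitems++;			/* AT_HWCAP2 -- the entry that was missed */
#endif
	csp = sp;
	sp -= nitems * 2 * sizeof(unsigned long);	/* two words per entry */

Undercounting nitems by one makes the final entries land outside the reserved area, which is what produced the zeroed last entry before and the kernel BUG now.
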
Fixes: 10e29251be0e ("binfmt_elf_fdpic: fix /proc//auxv") Cc: stable@vger.kernel.org Reported-by: Greg Ungerer Closes: https://lore.kernel.org/lkml/5b51975f-6d0b-413c-8b38-39a6a45e8821@westnet.com.au/ Signed-off-by: Max Filippov Tested-by: Greg Ungerer Link: https://lore.kernel.org/r/20240826032745.3423812-1-jcmvbkbc@gmail.com Signed-off-by: Kees Cook --- fs/binfmt_elf_fdpic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 28a3439f163a..4fe5bb9f1b1f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -589,6 +589,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, if (bprm->have_execfd) nitems++; +#ifdef ELF_HWCAP2 + nitems++; +#endif csp = sp; sp -= nitems * 2 * sizeof(unsigned long); -- cgit From 7e8ae8486e4471513e2111aba6ac29f2357bed2a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 26 Aug 2024 10:32:34 -0400 Subject: fs/nfsd: fix update of inode attrs in CB_GETATTR Currently, we copy the mtime and ctime to the in-core inode and then mark the inode dirty. This is fine for certain types of filesystems, but not all. Some require a real setattr to properly change these values (e.g. ceph or reexported NFS). Fix this code to call notify_change() instead, which is the proper way to effect a setattr. There is one problem though: In this case, the client is holding a write delegation and has sent us attributes to update our cache. We don't want to break the delegation for this since that would defeat the purpose. Add a new ATTR_DELEG flag that makes notify_change bypass the try_break_deleg call. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Reviewed-by: Christian Brauner Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/attr.c | 14 +++++++++++--- fs/nfsd/nfs4state.c | 18 +++++++++++++----- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/state.h | 2 +- include/linux/fs.h | 1 + 5 files changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/attr.c b/fs/attr.c index 960a310581eb..0dbf43b6555c 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -489,9 +489,17 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, error = security_inode_setattr(idmap, dentry, attr); if (error) return error; - error = try_break_deleg(inode, delegated_inode); - if (error) - return error; + + /* + * If ATTR_DELEG is set, then these attributes are being set on + * behalf of the holder of a write delegation. We want to avoid + * breaking the delegation in this case. + */ + if (!(ia_valid & ATTR_DELEG)) { + error = try_break_deleg(inode, delegated_inode); + if (error) + return error; + } if (inode->i_op->setattr) error = inode->i_op->setattr(idmap, dentry, attr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 02d43f95146e..07f2496850c4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8815,7 +8815,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, /** * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context - * @inode: file to be checked for a conflict + * @dentry: dentry of inode to be checked for a conflict * @modified: return true if file was modified * @size: new size of file if modified is true * @@ -8830,7 +8830,7 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * code is returned. 
*/ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, bool *modified, u64 *size) { __be32 status; @@ -8839,6 +8839,7 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, struct file_lease *fl; struct iattr attrs; struct nfs4_cb_fattr *ncf; + struct inode *inode = d_inode(dentry); *modified = false; ctx = locks_inode_context(inode); @@ -8890,15 +8891,22 @@ break_lease: ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) ncf->ncf_file_modified = true; if (ncf->ncf_file_modified) { + int err; + /* * Per section 10.4.3 of RFC 8881, the server would * not update the file's metadata with the client's * modified size */ attrs.ia_mtime = attrs.ia_ctime = current_time(inode); - attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; - setattr_copy(&nop_mnt_idmap, inode, &attrs); - mark_inode_dirty(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME | ATTR_DELEG; + inode_lock(inode); + err = notify_change(&nop_mnt_idmap, dentry, &attrs, NULL); + inode_unlock(inode); + if (err) { + nfs4_put_stid(&dp->dl_stid); + return nfserrno(err); + } ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; *size = ncf->ncf_cur_fsize; *modified = true; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 43ccf6119cf1..97f583777972 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3565,7 +3565,7 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + status = nfsd4_deleg_getattr_conflict(rqstp, dentry, &file_modified, &size); if (status) goto out; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index ffc217099d19..ec4559ecd193 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -781,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode, bool *file_modified, u64 *size); + struct dentry *dentry, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 0283cf366c2a..bafc1d134b94 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) +#define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. 
The following constants define the -- cgit From 10d9d8c3512f16cad47b2ff81ec6fc4b27d8ee10 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 17 Aug 2024 18:34:30 +0930 Subject: btrfs: fix a use-after-free when hitting errors inside btrfs_submit_chunk() [BUG] There is an internal report that KASAN is reporting use-after-free, with the following backtrace: BUG: KASAN: slab-use-after-free in btrfs_check_read_bio+0xa68/0xb70 [btrfs] Read of size 4 at addr ffff8881117cec28 by task kworker/u16:2/45 CPU: 1 UID: 0 PID: 45 Comm: kworker/u16:2 Not tainted 6.11.0-rc2-next-20240805-default+ #76 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 Workqueue: btrfs-endio btrfs_end_bio_work [btrfs] Call Trace: dump_stack_lvl+0x61/0x80 print_address_description.constprop.0+0x5e/0x2f0 print_report+0x118/0x216 kasan_report+0x11d/0x1f0 btrfs_check_read_bio+0xa68/0xb70 [btrfs] process_one_work+0xce0/0x12a0 worker_thread+0x717/0x1250 kthread+0x2e3/0x3c0 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x11/0x20 Allocated by task 20917: kasan_save_stack+0x37/0x60 kasan_save_track+0x10/0x30 __kasan_slab_alloc+0x7d/0x80 kmem_cache_alloc_noprof+0x16e/0x3e0 mempool_alloc_noprof+0x12e/0x310 bio_alloc_bioset+0x3f0/0x7a0 btrfs_bio_alloc+0x2e/0x50 [btrfs] submit_extent_page+0x4d1/0xdb0 [btrfs] btrfs_do_readpage+0x8b4/0x12a0 [btrfs] btrfs_readahead+0x29a/0x430 [btrfs] read_pages+0x1a7/0xc60 page_cache_ra_unbounded+0x2ad/0x560 filemap_get_pages+0x629/0xa20 filemap_read+0x335/0xbf0 vfs_read+0x790/0xcb0 ksys_read+0xfd/0x1d0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Freed by task 20917: kasan_save_stack+0x37/0x60 kasan_save_track+0x10/0x30 kasan_save_free_info+0x37/0x50 __kasan_slab_free+0x4b/0x60 kmem_cache_free+0x214/0x5d0 bio_free+0xed/0x180 end_bbio_data_read+0x1cc/0x580 [btrfs] btrfs_submit_chunk+0x98d/0x1880 [btrfs] btrfs_submit_bio+0x33/0x70 [btrfs] submit_one_bio+0xd4/0x130 [btrfs] submit_extent_page+0x3ea/0xdb0 [btrfs] btrfs_do_readpage+0x8b4/0x12a0 [btrfs] btrfs_readahead+0x29a/0x430 [btrfs] read_pages+0x1a7/0xc60 page_cache_ra_unbounded+0x2ad/0x560 filemap_get_pages+0x629/0xa20 filemap_read+0x335/0xbf0 vfs_read+0x790/0xcb0 ksys_read+0xfd/0x1d0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [CAUSE] Although I cannot reproduce the error, the report itself is good enough to pin down the cause. The call trace is the regular endio workqueue context, but the free-by-task trace is showing that during btrfs_submit_chunk() we already hit a critical error, and is calling btrfs_bio_end_io() to error out. And the original endio function called bio_put() to free the whole bio. This means a double freeing thus causing use-after-free, e.g.: 1. Enter btrfs_submit_bio() with a read bio The read bio length is 128K, crossing two 64K stripes. 2. The first run of btrfs_submit_chunk() 2.1 Call btrfs_map_block(), which returns 64K 2.2 Call btrfs_split_bio() Now there are two bios, one referring to the first 64K, the other referring to the second 64K. 2.3 The first half is submitted. 3. The second run of btrfs_submit_chunk() 3.1 Call btrfs_map_block(), which by somehow failed Now we call btrfs_bio_end_io() to handle the error 3.2 btrfs_bio_end_io() calls the original endio function Which is end_bbio_data_read(), and it calls bio_put() for the original bio. Now the original bio is freed. 4. The submitted first 64K bio finished Now we call into btrfs_check_read_bio() and tries to advance the bio iter. 
But since the original bio (thus its iter) is already freed, we trigger the above use-after free. And even if the memory is not poisoned/corrupted, we will later call the original endio function, causing a double freeing. [FIX] Instead of calling btrfs_bio_end_io(), call btrfs_orig_bbio_end_io(), which has the extra check on split bios and do the proper refcounting for cloned bios. Furthermore there is already one extra btrfs_cleanup_bio() call, but that is duplicated to btrfs_orig_bbio_end_io() call, so remove that label completely. Reported-by: David Sterba Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios") CC: stable@vger.kernel.org # 6.6+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index f04d93109960..b4e31ae17cd9 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -668,7 +668,6 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = bbio->fs_info; - struct btrfs_bio *orig_bbio = bbio; struct bio *bio = &bbio->bio; u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 length = bio->bi_iter.bi_size; @@ -706,7 +705,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); if (ret) - goto fail_put_bio; + goto fail; } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { @@ -740,13 +739,13 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) ret = btrfs_bio_csum(bbio); if (ret) - goto fail_put_bio; + goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); if (ret) - goto fail_put_bio; + goto fail; } } @@ -754,12 +753,23 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) done: return map_length == length; -fail_put_bio: - if (map_length < length) - btrfs_cleanup_bio(bbio); fail: btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(orig_bbio, ret); + /* + * We have split the original bbio, now we have to end both the current + * @bbio and remaining one, as the remaining one will never be submitted. + */ + if (map_length < length) { + struct btrfs_bio *remaining = bbio->private; + + ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); + ASSERT(remaining); + + remaining->bio.bi_status = ret; + btrfs_orig_bbio_end_io(remaining); + } + bbio->bio.bi_status = ret; + btrfs_orig_bbio_end_io(bbio); /* Do not submit another chunk */ return true; } -- cgit From 66927b89289974dab6d3b3cdd7706d0376034114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 15:11:38 -0400 Subject: bcachefs: Fix failure to return error in data_update_index_update() This fixes an assertion pop in io_write.c - if we don't return an error we're supposed to have completed all the btree updates. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 65176d51b502..004894ad4147 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -337,6 +337,7 @@ restart_drop_extra_replicas: printbuf_exit(&buf); bch2_fatal_error(c); + ret = -EIO; goto out; } -- cgit From d26935690c03fe8159d42358bed1c56252700cd1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 26 Aug 2024 19:11:00 -0400 Subject: bcachefs: Fix bch2_extents_match() false positive This was caught as a very rare nonce inconsistency, on systems with encryption and replication (and tiering, or some form of rebalance operation running): [Wed Jul 17 13:30:03 2024] about to insert invalid key in data update path [Wed Jul 17 13:30:03 2024] old: u64s 10 type extent 671283510:6392:U32_MAX len 16 ver 106595503: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:104 gen 7 ptr: 4:513244:48 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] k: u64s 10 type extent 671283510:6400:U32_MAX len 16 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 ptr: 4:513244:56 gen 6 rebalance: target hdd compression zstd [Wed Jul 17 13:30:03 2024] new: u64s 14 type extent 671283510:6392:U32_MAX len 8 ver 106595508: durability: 2 crc: c_size 8 size 16 offset 0 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 3:355968:112 gen 7 cached ptr: 4:513244:56 gen 6 cached rebalance: target hdd compression zstd crc: c_size 8 size 16 offset 8 nonce 0 csum chacha20_poly1305_80 compress zstd ptr: 1:10860085:32 gen 0 ptr: 0:17285918:408 gen 0 [Wed Jul 17 13:30:03 2024] bcachefs (cca5bc65-fe77-409d-a9fa-465a6e7f4eae): fatal error - emergency read only bch2_extents_match() was reporting true for extents that did not actually point to the same data. bch2_extent_match() iterates over pairs of pointers, looking for pointers that point to the same location on disk (with matching generation numbers). However one or both extents may have been trimmed (or merged) and they might not have the same disk offset: it corrects for this by subtracting the key offset and the checksum entry offset. However, this failed when an extent was immediately partially overwritten, and the new overwrite was allocated the next adjacent disk space. Normally, with compression off, this would never cause a bug, since the new extent would have to be immediately after the old extent for the pointer offsets to match, and the rebalance index update path is not looking for an extent outside the range of the extent it moved. However with compression enabled, extents take up less space on disk than they do in the btree index space - and spuriously matching after partial overwrite is possible. To fix this, add a secondary check, that strictly checks that the regions pointed to on disk overlap. 
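The overlap test being added is the usual half-open interval intersection, applied to the compressed on-disk sizes. A sketch (hypothetical helper; the fields are those used in the patch below, and for non-empty extents this form is equivalent to the two-sided test in the diff):

static bool ptrs_overlap_on_disk(struct extent_ptr_decoded p1,
				 struct extent_ptr_decoded p2)
{
	u64 start1 = p1.ptr.offset, end1 = start1 + p1.crc.compressed_size;
	u64 start2 = p2.ptr.offset, end2 = start2 + p2.crc.compressed_size;

	/* [start1, end1) and [start2, end2) intersect iff each begins
	 * before the other ends */
	return start1 < end2 && start2 < end1;
}
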
https://github.com/koverstreet/bcachefs/issues/717 Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e317df3644a1..eb31bda19544 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && + + /* + * This checks that the two pointers point + * to the same region on disk - adjusting + * for the difference in where the extents + * start, since one may have been trimmed: + */ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && + + /* + * This additionally checks that the + * extents overlap on disk, since the + * previous check may trigger spuriously + * when one extent is immediately partially + * overwritten with another extent (so that + * on disk they are adjacent) and + * compression is in use: + */ + ((p1.ptr.offset >= p2.ptr.offset && + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || + (p2.ptr.offset >= p1.ptr.offset && + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) return true; return false; -- cgit From 6b35cc8d9239569700cc7cc737c8ed40b8b9cfdb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:00:20 -0700 Subject: xfs: use XFS_BUF_DADDR_NULL for daddrs in getfsmap code Use XFS_BUF_DADDR_NULL (instead of a magic sentinel value) to mean "this field is null" like the rest of xfs. Cc: wozizhi@huawei.com Fixes: e89c041338ed6 ("xfs: implement the GETFSMAP ioctl") Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 3a30b36779db..613a0ec20412 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -252,7 +252,7 @@ xfs_getfsmap_rec_before_start( const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { - if (info->low_daddr != -1ULL) + if (info->low_daddr != XFS_BUF_DADDR_NULL) return rec_daddr < info->low_daddr; if (info->low.rm_blockcount) return xfs_rmap_compare(rec, &info->low) < 0; @@ -983,7 +983,7 @@ xfs_getfsmap( info.dev = handlers[i].dev; info.last = false; info.pag = NULL; - info.low_daddr = -1ULL; + info.low_daddr = XFS_BUF_DADDR_NULL; info.low.rm_blockcount = 0; error = handlers[i].fn(tp, dkeys, &info); if (error) -- cgit From ca6448aed4f10ad88eba79055f181eb9a589a7b3 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Thu, 22 Aug 2024 17:00:35 -0700 Subject: xfs: Fix missing interval for missing_owner in xfs fsmap In the fsmap query of xfs, there is an interval missing problem: [root@fedora ~]# xfs_io -c 'fsmap -vvvv' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [0..7]: static fs metadata 0 (0..7) 8 1: 253:16 [8..23]: per-AG metadata 0 (8..23) 16 2: 253:16 [24..39]: inode btree 0 (24..39) 16 3: 253:16 [40..47]: per-AG metadata 0 (40..47) 8 4: 253:16 [48..55]: refcount btree 0 (48..55) 8 5: 253:16 [56..103]: per-AG metadata 0 (56..103) 48 6: 253:16 [104..127]: free space 0 (104..127) 24 ...... 
BUG: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 104 107' /mnt [root@fedora ~]# Normally, we should be able to get [104, 107), but we got nothing. The problem is caused by shifting. The query for the problem-triggered scenario is for the missing_owner interval (e.g. freespace in rmapbt/ unknown space in bnobt), which is obtained by subtraction (gap). For this scenario, the interval is obtained by info->last. However, rec_daddr is calculated based on the start_block recorded in key[1], which is converted by calling XFS_BB_TO_FSBT. Then if rec_daddr does not exceed info->next_daddr, which means keys[1].fmr_physical >> (mp)->m_blkbb_log <= info->next_daddr, no records will be displayed. In the above example, 104 >> (mp)->m_blkbb_log = 12 and 107 >> (mp)->m_blkbb_log = 12, so the two are reduced to 0 and the gap is ignored: before calculate ----------------> after shifting 104(st) 107(ed) 12(st/ed) |---------| | sector size block size Resolve this issue by introducing the "end_daddr" field in xfs_getfsmap_info. This records |key[1].fmr_physical + key[1].length| at the granularity of sector. If the current query is the last, the rec_daddr is end_daddr to prevent missing interval problems caused by shifting. We only need to focus on the last query, because xfs disks are internally aligned with disk blocksize that are powers of two and minimum 512, so there is no problem with shifting in previous queries. After applying this patch, the above problem have been solved: [root@fedora ~]# xfs_io -c 'fsmap -vvvv -d 104 107' /mnt EXT: DEV BLOCK-RANGE OWNER FILE-OFFSET AG AG-OFFSET TOTAL 0: 253:16 [104..106]: free space 0 (104..106) 3 Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Zizhi Wo Reviewed-by: Darrick J. Wong [djwong: limit the range of end_addr correctly] Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsmap.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 613a0ec20412..71f32354944e 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -162,6 +162,7 @@ struct xfs_getfsmap_info { xfs_daddr_t next_daddr; /* next daddr we expect */ /* daddr of low fsmap key when we're using the rtbitmap */ xfs_daddr_t low_daddr; + xfs_daddr_t end_daddr; /* daddr of high fsmap key */ u64 missing_owner; /* owner of holes */ u32 dev; /* device id */ /* @@ -182,6 +183,7 @@ struct xfs_getfsmap_dev { int (*fn)(struct xfs_trans *tp, const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); + sector_t nr_sectors; }; /* Compare two getfsmap device handlers. */ @@ -294,6 +296,18 @@ xfs_getfsmap_helper( return 0; } + /* + * For an info->last query, we're looking for a gap between the last + * mapping emitted and the high key specified by userspace. If the + * user's query spans less than 1 fsblock, then info->high and + * info->low will have the same rm_startblock, which causes rec_daddr + * and next_daddr to be the same. Therefore, use the end_daddr that + * we calculated from userspace's high key to synthesize the record. + * Note that if the btree query found a mapping, there won't be a gap. + */ + if (info->last && info->end_daddr != XFS_BUF_DADDR_NULL) + rec_daddr = info->end_daddr; + /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { if (info->head->fmh_entries == UINT_MAX) @@ -904,17 +918,21 @@ xfs_getfsmap( /* Set up our device handlers. 
*/ memset(handlers, 0, sizeof(handlers)); + handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else handlers[0].fn = xfs_getfsmap_datadev_bnobt; if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, + mp->m_sb.sb_logblocks); handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT if (mp->m_rtdev_targp) { + handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; } @@ -946,6 +964,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; + info.end_daddr = XFS_BUF_DADDR_NULL; info.fsmap_recs = fsmap_recs; info.head = head; @@ -966,8 +985,11 @@ xfs_getfsmap( * low key, zero out the low key so that we get * everything from the beginning. */ - if (handlers[i].dev == head->fmh_keys[1].fmr_device) + if (handlers[i].dev == head->fmh_keys[1].fmr_device) { dkeys[1] = head->fmh_keys[1]; + info.end_daddr = min(handlers[i].nr_sectors - 1, + dkeys[1].fmr_physical); + } if (handlers[i].dev > head->fmh_keys[0].fmr_device) memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); -- cgit From 16e1fbdce9c8d084863fd63cdaff8fb2a54e2f88 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:00:51 -0700 Subject: xfs: take m_growlock when running growfsrt Take the grow lock when we're expanding the realtime volume, like we do for the other growfs calls. Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c3e96c621a6..776d6c401f62 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -821,34 +821,39 @@ xfs_growfs_rt( /* Needs to have been mounted with an rt device. */ if (!XFS_IS_REALTIME_MOUNT(mp)) return -EINVAL; + + if (!mutex_trylock(&mp->m_growlock)) + return -EWOULDBLOCK; /* * Mount should fail if the rt bitmap/summary files don't load, but * we'll check anyway. */ + error = -EINVAL; if (!mp->m_rbmip || !mp->m_rsumip) - return -EINVAL; + goto out_unlock; /* Shrink not supported. */ if (in->newblocks <= sbp->sb_rblocks) - return -EINVAL; + goto out_unlock; /* Can only change rt extent size when adding rt volume. */ if (sbp->sb_rblocks > 0 && in->extsize != sbp->sb_rextsize) - return -EINVAL; + goto out_unlock; /* Range check the extent size. */ if (XFS_FSB_TO_B(mp, in->extsize) > XFS_MAX_RTEXTSIZE || XFS_FSB_TO_B(mp, in->extsize) < XFS_MIN_RTEXTSIZE) - return -EINVAL; + goto out_unlock; /* Unsupported realtime features. */ + error = -EOPNOTSUPP; if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) - return -EOPNOTSUPP; + goto out_unlock; nrblocks = in->newblocks; error = xfs_sb_validate_fsb_count(sbp, nrblocks); if (error) - return error; + goto out_unlock; /* * Read in the last block of the device, make sure it exists. 
*/ @@ -856,7 +861,7 @@ xfs_growfs_rt( XFS_FSB_TO_BB(mp, nrblocks - 1), XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); if (error) - return error; + goto out_unlock; xfs_buf_relse(bp); /* @@ -864,8 +869,10 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); - if (!xfs_validate_rtextents(nrextents)) - return -EINVAL; + if (!xfs_validate_rtextents(nrextents)) { + error = -EINVAL; + goto out_unlock; + } nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; @@ -876,8 +883,11 @@ xfs_growfs_rt( * the log. This prevents us from getting a log overflow, * since we'll log basically the whole summary file at once. */ - if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) - return -EINVAL; + if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1)) { + error = -EINVAL; + goto out_unlock; + } + /* * Get the old block counts for bitmap and summary inodes. * These can't change since other growfs callers are locked out. @@ -889,10 +899,10 @@ xfs_growfs_rt( */ error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); if (error) - return error; + goto out_unlock; error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); if (error) - return error; + goto out_unlock; rsum_cache = mp->m_rsum_cache; if (nrbmblocks != sbp->sb_rbmblocks) @@ -1059,6 +1069,8 @@ out_free: } } +out_unlock: + mutex_unlock(&mp->m_growlock); return error; } -- cgit From a24cae8fc1f13f6f6929351309f248fd2e9351ce Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Aug 2024 17:01:07 -0700 Subject: xfs: reset rootdir extent size hint after growfsrt If growfsrt is run on a filesystem that doesn't have a rt volume, it's possible to change the rt extent size. If the root directory was previously set up with an inherited extent size hint and rtinherit, it's possible that the hint is no longer a multiple of the rt extent size. Although the verifiers don't complain about this, xfs_repair will, so if we detect this situation, log the root directory to clean it up. This is still racy, but it's better than nothing. Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 776d6c401f62..ebeab8e4dab1 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -784,6 +784,39 @@ xfs_alloc_rsum_cache( xfs_warn(mp, "could not allocate realtime summary cache"); } +/* + * If we changed the rt extent size (meaning there was no rt volume previously) + * and the root directory had EXTSZINHERIT and RTINHERIT set, it's possible + * that the extent size hint on the root directory is no longer congruent with + * the new rt extent size. Log the rootdir inode to fix this. + */ +static int +xfs_growfs_rt_fixup_extsize( + struct xfs_mount *mp) +{ + struct xfs_inode *ip = mp->m_rootip; + struct xfs_trans *tp; + int error = 0; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (!(ip->i_diflags & XFS_DIFLAG_RTINHERIT) || + !(ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)) + goto out_iolock; + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_ichange, 0, 0, false, + &tp); + if (error) + goto out_iolock; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +out_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + /* * Visible (exported) functions. 
*/ @@ -812,6 +845,7 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ uint8_t *rsum_cache; /* old summary cache */ + xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; sbp = &mp->m_sb; @@ -1046,6 +1080,12 @@ error_cancel: if (error) goto out_free; + if (old_rextsize != in->extsize) { + error = xfs_growfs_rt_fixup_extsize(mp); + if (error) + goto out_free; + } + /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); -- cgit From ecb54277cb63c273e8d74272e5b9bfd80c2185d9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 27 Aug 2024 11:30:10 +0100 Subject: btrfs: fix uninitialized return value from btrfs_reclaim_sweep() The return variable 'ret' at btrfs_reclaim_sweep() is never assigned if none of the space infos is reclaimable (for example if periodic reclaim is disabled, which is the default), so we return an undefined value. This can be fixed by making btrfs_reclaim_sweep() not return any value as well as do_reclaim_sweep() because: 1) do_reclaim_sweep() always returns 0, so we can make it return void; 2) The only caller of btrfs_reclaim_sweep() (btrfs_reclaim_bgs()) doesn't care about its return value, and in its context there's nothing to do about any errors anyway. Therefore remove the return value from btrfs_reclaim_sweep() and do_reclaim_sweep(). Fixes: e4ca3932ae90 ("btrfs: periodic block_group reclaim") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++------------ fs/btrfs/space-info.h | 2 +- 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 68e14fd48638..c691784b4660 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1985,8 +1985,8 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info) return unalloc < data_chunk_size; } -static int do_reclaim_sweep(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, int raid) +static void do_reclaim_sweep(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, int raid) { struct btrfs_block_group *bg; int thresh_pct; @@ -2031,7 +2031,6 @@ again: } up_read(&space_info->groups_sem); - return 0; } void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) @@ -2074,21 +2073,15 @@ bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info) return ret; } -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info) { - int ret; int raid; struct btrfs_space_info *space_info; list_for_each_entry(space_info, &fs_info->space_info, list) { if (!btrfs_should_periodic_reclaim(space_info)) continue; - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { - ret = do_reclaim_sweep(fs_info, space_info, raid); - if (ret) - return ret; - } + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) + do_reclaim_sweep(fs_info, space_info, raid); } - - return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 88b44221ce97..5602026c5e14 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -294,6 +294,6 @@ void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s6 void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready); bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info); int btrfs_calc_reclaim_threshold(struct
btrfs_space_info *space_info); -int btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); +void btrfs_reclaim_sweep(struct btrfs_fs_info *fs_info); #endif /* BTRFS_SPACE_INFO_H */ -- cgit From 6a5dcd487791e0c2d86622064602a5c7459941ed Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Aug 2024 23:06:48 +0100 Subject: cifs: Fix lack of credit renegotiation on read retry When netfslib asks cifs to issue a read operation, it prefaces this with a call to ->clamp_length() which cifs uses to negotiate credits, providing receive capacity on the server; however, in the event that a read op needs reissuing, netfslib doesn't call ->clamp_length() again as that could shorten the subrequest, leaving a gap. This causes the retried read to be done with zero credits which causes the server to reject it with STATUS_INVALID_PARAMETER. This is a problem for a DIO read that is requested that would go over the EOF. The short read will be retried, causing EINVAL to be returned to the user when it fails. Fix this by making cifs_req_issue_read() negotiate new credits if retrying (NETFS_SREQ_RETRYING now gets set in the read side as well as the write side in this instance). This isn't sufficient, however: the new credits might not be sufficient to complete the remainder of the read, so also add an additional field, rreq->actual_len, that holds the actual size of the op we want to perform without having to alter subreq->len. We then rely on repeated short reads being retried until we finish the read or reach the end of file and make a zero-length read. Also fix a couple of places where the subrequest start and length need to be altered by the amount so far transferred when being used. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/netfs/io.c | 2 ++ fs/smb/client/cifsglob.h | 1 + fs/smb/client/file.c | 37 +++++++++++++++++++++++++++++++++---- fs/smb/client/smb2ops.c | 2 +- fs/smb/client/smb2pdu.c | 28 +++++++++++++++++----------- fs/smb/client/trace.h | 1 + 6 files changed, 55 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 4da0a494e860..3303b515b536 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -306,6 +306,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) break; subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->error = 0; + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_stat(&netfs_n_rh_download_instead); trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); @@ -313,6 +314,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq) netfs_reset_subreq_iter(rreq, subreq); netfs_read_from_server(rreq, subreq); } else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) { + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); netfs_reset_subreq_iter(rreq, subreq); netfs_rreq_short_read(rreq, subreq); } diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index f379b9dc93ba..9eae8649f90c 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1485,6 +1485,7 @@ struct cifs_io_subrequest { struct cifs_io_request *req; }; ssize_t got_bytes; + size_t actual_len; unsigned int xid; int result; bool have_xid; diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index f9b302cb8233..2d387485f05b 100644 
--- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -111,6 +111,7 @@ static void cifs_issue_write(struct netfs_io_subrequest *subreq) goto fail; } + wdata->actual_len = wdata->subreq.len; rc = adjust_credits(wdata->server, wdata, cifs_trace_rw_credits_issue_write_adjust); if (rc) goto fail; @@ -153,7 +154,7 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); struct TCP_Server_Info *server = req->server; struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); - size_t rsize = 0; + size_t rsize; int rc; rdata->xid = get_xid(); @@ -166,8 +167,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) cifs_sb->ctx); - rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize, - &rdata->credits); + rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, + &rsize, &rdata->credits); if (rc) { subreq->error = rc; return false; @@ -183,7 +184,8 @@ static bool cifs_clamp_length(struct netfs_io_subrequest *subreq) server->credits, server->in_flight, 0, cifs_trace_rw_credits_read_submit); - subreq->len = min_t(size_t, subreq->len, rsize); + subreq->len = umin(subreq->len, rsize); + rdata->actual_len = subreq->len; #ifdef CONFIG_CIFS_SMB_DIRECT if (server->smbd_conn) @@ -203,12 +205,39 @@ static void cifs_req_issue_read(struct netfs_io_subrequest *subreq) struct netfs_io_request *rreq = subreq->rreq; struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq); struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq); + struct TCP_Server_Info *server = req->server; + struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb); int rc = 0; cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n", __func__, rreq->debug_id, subreq->debug_index, rreq->mapping, subreq->transferred, subreq->len); + if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) { + /* + * As we're issuing a retry, we need to negotiate some new + * credits otherwise the server may reject the op with + * INVALID_PARAMETER. Note, however, we may get back less + * credit than we need to complete the op, in which case, we + * shorten the op and rely on additional rounds of retry. 
+ */ + size_t rsize = umin(subreq->len - subreq->transferred, + cifs_sb->ctx->rsize); + + rc = server->ops->wait_mtu_credits(server, rsize, &rdata->actual_len, + &rdata->credits); + if (rc) + goto out; + + rdata->credits.in_flight_check = 1; + + trace_smb3_rw_credits(rdata->rreq->debug_id, + rdata->subreq.debug_index, + rdata->credits.value, + server->credits, server->in_flight, 0, + cifs_trace_rw_credits_read_resubmit); + } + if (req->cfile->invalidHandle) { do { rc = cifs_reopen_file(req->cfile, true); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0b9cb1a60d4a..a6f00b157275 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -301,7 +301,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, unsigned int /*enum smb3_rw_credits_trace*/ trace) { struct cifs_credits *credits = &subreq->credits; - int new_val = DIV_ROUND_UP(subreq->subreq.len, SMB2_MAX_BUFFER_SIZE); + int new_val = DIV_ROUND_UP(subreq->actual_len, SMB2_MAX_BUFFER_SIZE); int scredits, in_flight; if (!credits->value || credits->value == new_val) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 2d7e6c42cf18..be7a1a9c691d 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4529,9 +4529,9 @@ smb2_readv_callback(struct mid_q_entry *mid) "rdata server %p != mid server %p", rdata->server, mid->server); - cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu\n", + cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%zu/%zu\n", __func__, mid->mid, mid->mid_state, rdata->result, - rdata->subreq.len); + rdata->actual_len, rdata->subreq.len - rdata->subreq.transferred); switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: @@ -4585,15 +4585,18 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, rdata->subreq.start, - rdata->subreq.len, rdata->result); + tcon->tid, tcon->ses->Suid, + rdata->subreq.start + rdata->subreq.transferred, + rdata->actual_len, + rdata->result); } else trace_smb3_read_done(rdata->rreq->debug_id, rdata->subreq.debug_index, rdata->xid, rdata->req->cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, - rdata->subreq.start, rdata->got_bytes); + rdata->subreq.start + rdata->subreq.transferred, + rdata->got_bytes); if (rdata->result == -ENODATA) { /* We may have got an EOF error because fallocate @@ -4621,6 +4624,7 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) { int rc, flags = 0; char *buf; + struct netfs_io_subrequest *subreq = &rdata->subreq; struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, @@ -4631,15 +4635,15 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) int credit_request; cifs_dbg(FYI, "%s: offset=%llu bytes=%zu\n", - __func__, rdata->subreq.start, rdata->subreq.len); + __func__, subreq->start, subreq->len); if (!rdata->server) rdata->server = cifs_pick_channel(tcon->ses); io_parms.tcon = tlink_tcon(rdata->req->cfile->tlink); io_parms.server = server = rdata->server; - io_parms.offset = rdata->subreq.start; - io_parms.length = rdata->subreq.len; + io_parms.offset = subreq->start + subreq->transferred; + io_parms.length = rdata->actual_len; io_parms.persistent_fid = rdata->req->cfile->fid.persistent_fid; io_parms.volatile_fid = rdata->req->cfile->fid.volatile_fid; io_parms.pid = rdata->req->pid; @@ -4654,11 +4658,13 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; + rdata->got_bytes = 0; + 
rdata->result = 0; shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { - shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->subreq.len, + shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->actual_len, SMB2_MAX_BUFFER_SIZE)); credit_request = le16_to_cpu(shdr->CreditCharge) + 8; if (server->credits >= server->max_credits) @@ -4682,11 +4688,11 @@ smb2_async_readv(struct cifs_io_subrequest *rdata) if (rc) { cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); trace_smb3_read_err(rdata->rreq->debug_id, - rdata->subreq.debug_index, + subreq->debug_index, rdata->xid, io_parms.persistent_fid, io_parms.tcon->tid, io_parms.tcon->ses->Suid, - io_parms.offset, io_parms.length, rc); + io_parms.offset, rdata->actual_len, rc); } async_readv_out: diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index 0f0c10c7ada7..8e9964001e2a 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -30,6 +30,7 @@ EM(cifs_trace_rw_credits_old_session, "old-session") \ EM(cifs_trace_rw_credits_read_response_add, "rd-resp-add") \ EM(cifs_trace_rw_credits_read_response_clear, "rd-resp-clr") \ + EM(cifs_trace_rw_credits_read_resubmit, "rd-resubmit") \ EM(cifs_trace_rw_credits_read_submit, "rd-submit ") \ EM(cifs_trace_rw_credits_write_prepare, "wr-prepare ") \ EM(cifs_trace_rw_credits_write_response_add, "wr-resp-add") \ -- cgit From 1da29f2c39b67b846b74205c81bf0ccd96d34727 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Aug 2024 23:06:49 +0100 Subject: netfs, cifs: Fix handling of short DIO read Short DIO reads, particularly in relation to cifs, are not being handled correctly by cifs and netfslib. This can be tested by doing a DIO read of a file where the size of read is larger than the size of the file. When it crosses the EOF, it gets a short read and this gets retried, and in the case of cifs, the retry read fails, with the failure being translated to ENODATA. Fix this by the following means: (1) Add a flag, NETFS_SREQ_HIT_EOF, for the filesystem to set when it detects that the read did hit the EOF. (2) Make the netfslib read assessment stop processing subrequests when it encounters one with that flag set. (3) Return rreq->transferred, the accumulated contiguous amount read to that point, to userspace for a DIO read. (4) Make cifs set the flag and clear the error if the read RPC returned ENODATA. (5) Make cifs set the flag and clear the error if a short read occurred without error and the read-to file position is now at the remote inode size. 
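Condensed, the EOF inference from points (4) and (5) amounts to the following (a sketch of the logic only, not the exact code; the actual hunks in smb2_readv_callback() follow below):

	/* ENODATA from the server, or a short-but-clean read that ends
	 * exactly at the remote i_size, means we hit EOF: flag it so
	 * netfslib treats the subrequest as complete instead of retrying. */
	if (rdata->result == -ENODATA ||
	    (rdata->result == 0 &&
	     rdata->got_bytes < rdata->actual_len &&
	     rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes ==
	     ictx->remote_i_size)) {
		__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
		rdata->result = 0;
	}
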
Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/netfs/io.c | 17 +++++++++++------ fs/smb/client/smb2pdu.c | 13 +++++++++---- include/linux/netfs.h | 1 + 3 files changed, 21 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 3303b515b536..943128507af5 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -368,7 +368,8 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) if (subreq->error || subreq->transferred == 0) break; transferred += subreq->transferred; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len || + test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) break; } @@ -503,7 +504,8 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, subreq->error = 0; subreq->transferred += transferred_or_error; - if (subreq->transferred < subreq->len) + if (subreq->transferred < subreq->len && + !test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) goto incomplete; complete: @@ -782,10 +784,13 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) TASK_UNINTERRUPTIBLE); ret = rreq->error; - if (ret == 0 && rreq->submitted < rreq->len && - rreq->origin != NETFS_DIO_READ) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; + if (ret == 0) { + if (rreq->origin == NETFS_DIO_READ) { + ret = rreq->transferred; + } else if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } } } else { /* If we decrement nr_outstanding to 0, the ref belongs to us. */ diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index be7a1a9c691d..88dc49d67037 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -4507,6 +4507,7 @@ static void smb2_readv_callback(struct mid_q_entry *mid) { struct cifs_io_subrequest *rdata = mid->callback_data; + struct netfs_inode *ictx = netfs_inode(rdata->rreq->inode); struct cifs_tcon *tcon = tlink_tcon(rdata->req->cfile->tlink); struct TCP_Server_Info *server = rdata->server; struct smb2_hdr *shdr = @@ -4599,11 +4600,15 @@ smb2_readv_callback(struct mid_q_entry *mid) rdata->got_bytes); if (rdata->result == -ENODATA) { - /* We may have got an EOF error because fallocate - * failed to enlarge the file. 
- */ - if (rdata->subreq.start < rdata->subreq.rreq->i_size) + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); + rdata->result = 0; + } else { + if (rdata->got_bytes < rdata->actual_len && + rdata->subreq.start + rdata->subreq.transferred + rdata->got_bytes == + ictx->remote_i_size) { + __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); rdata->result = 0; + } } trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, server->credits, server->in_flight, diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 983816608f15..c47443e7a97e 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -198,6 +198,7 @@ struct netfs_io_subrequest { #define NETFS_SREQ_NEED_RETRY 9 /* Set if the filesystem requests a retry */ #define NETFS_SREQ_RETRYING 10 /* Set if we're retrying */ #define NETFS_SREQ_FAILED 11 /* Set if the subreq failed unretryably */ +#define NETFS_SREQ_HIT_EOF 12 /* Set if we hit the EOF */ }; enum netfs_io_origin { -- cgit From 8101d6e112e2524e967368f920c404ae445a9757 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 27 Aug 2024 15:47:27 +0100 Subject: cifs: Fix copy offload to flush destination region Fix cifs_file_copychunk_range() to flush the destination region before invalidating it to avoid potential loss of data should the copy fail, in whole or in part, in some way. Fixes: 7b2404a886f8 ("cifs: Fix flushing, invalidation and file size with copy_file_range()") Signed-off-by: David Howells cc: Steve French cc: Paulo Alcantara cc: Shyam Prasad N cc: Rohith Surabattula cc: Matthew Wilcox cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cifsfs.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index d89485235425..2a2523c93944 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1341,7 +1341,6 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; struct cifs_tcon *target_tcon; - unsigned long long destend, fstart, fend; ssize_t rc; cifs_dbg(FYI, "copychunk range\n"); @@ -1391,25 +1390,13 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, goto unlock; } - destend = destoff + len - 1; - - /* Flush the folios at either end of the destination range to prevent - * accidental loss of dirty data outside of the range. + /* Flush and invalidate all the folios in the destination region. If + * the copy was successful, then some of the flush is extra overhead, + * but we need to allow for the copy failing in some way (eg. ENOSPC). */ - fstart = destoff; - fend = destend; - - rc = cifs_flush_folio(target_inode, destoff, &fstart, &fend, true); + rc = filemap_invalidate_inode(target_inode, true, destoff, destoff + len - 1); if (rc) goto unlock; - rc = cifs_flush_folio(target_inode, destend, &fstart, &fend, false); - if (rc) - goto unlock; - if (fend > target_cifsi->netfs.zero_point) - target_cifsi->netfs.zero_point = fend + 1; - - /* Discard all the folios that overlap the destination region. 
*/ - truncate_inode_pages_range(&target_inode->i_data, fstart, fend); fscache_invalidate(cifs_inode_cookie(target_inode), NULL, i_size_read(target_inode), 0); -- cgit From 91d1dfae464987aaf6c79ff51d8674880fb3be77 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Aug 2024 21:08:25 +0100 Subject: cifs: Fix FALLOC_FL_ZERO_RANGE to preflush buffered part of target region Under certain conditions, the range to be cleared by FALLOC_FL_ZERO_RANGE may only be buffered locally and not yet have been flushed to the server. For example: xfs_io -f -t -c "pwrite -S 0x41 0 4k" \ -c "pwrite -S 0x42 4k 4k" \ -c "fzero 0 4k" \ -c "pread -v 0 8k" /xfstest.test/foo will write two 4KiB blocks of data, which get buffered in the pagecache, and then fallocate() is used to clear the first 4KiB block on the server - but we don't flush the data first, which means the EOF position on the server is wrong, and so the FSCTL_SET_ZERO_DATA RPC fails (and xfs_io ignores the error), but then when we try to read it, we see the old data. Fix this by preflushing any part of the target region that is above the server's idea of the EOF position to force the server to update its EOF position. Note, however, that we don't want to simply expand the file by moving the EOF before doing the FSCTL_SET_ZERO_DATA[*] because someone else might see the zeroed region, or, if the RPC fails, we then have to try to clean it up or risk getting corruption. [*] And we have to move the EOF first, otherwise FSCTL_SET_ZERO_DATA won't do what we want. This fixes the generic/008 xfstest. [!] Note: A better way to do this might be to split the operation into two parts: we only do FSCTL_SET_ZERO_DATA for the part of the range below the server's EOF and then, if that worked, invalidate the buffered pages for the part above the range.
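As a rough sketch of that suggested two-part approach (purely illustrative, not what this patch implements: smb3_zero_range_split() is a made-up name, and the smb3_zero_data() signature is assumed from context rather than checked against the tree):

/* Hypothetical sketch of the suggested split; names and signatures
 * here are assumptions, not the change as merged.
 */
static long smb3_zero_range_split(struct file *file, struct cifs_tcon *tcon,
				  unsigned long long offset,
				  unsigned long long len)
{
	struct inode *inode = file_inode(file);
	unsigned long long remote_size = netfs_inode(inode)->remote_i_size;
	long rc = 0;

	if (offset < remote_size)
		/* Zero only the part the server already knows about... */
		rc = smb3_zero_data(file, tcon, offset,
				    umin(len, remote_size - offset));
	if (rc >= 0 && offset + len > remote_size)
		/* ...and just drop buffered pages above the server's EOF */
		truncate_pagecache_range(inode, umax(offset, remote_size),
					 offset + len - 1);
	return rc;
}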
Fixes: 6b69040247e1 ("cifs/smb3: Fix data inconsistent when zero file range") Signed-off-by: David Howells cc: Steve French cc: Zhang Xiaoxu cc: Pavel Shilovsky cc: Paulo Alcantara cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Steve French --- fs/smb/client/smb2ops.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a6f00b157275..4df84ebe8dbe 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -3237,13 +3237,15 @@ static long smb3_zero_data(struct file *file, struct cifs_tcon *tcon, } static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, - loff_t offset, loff_t len, bool keep_size) + unsigned long long offset, unsigned long long len, + bool keep_size) { struct cifs_ses *ses = tcon->ses; struct inode *inode = file_inode(file); struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsFileInfo *cfile = file->private_data; - unsigned long long new_size; + struct netfs_inode *ictx = netfs_inode(inode); + unsigned long long i_size, new_size, remote_size; long rc; unsigned int xid; @@ -3255,6 +3257,16 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); + i_size = i_size_read(inode); + remote_size = ictx->remote_i_size; + if (offset + len >= remote_size && offset < i_size) { + unsigned long long top = umin(offset + len, i_size); + + rc = filemap_write_and_wait_range(inode->i_mapping, offset, top - 1); + if (rc < 0) + goto zero_range_exit; + } + /* * We zero the range through ioctl, so we need remove the page caches * first, otherwise the data may be inconsistent with the server. -- cgit From 04c8abae1b7b2abeb638a3d5d5950fa2a031c244 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Thu, 29 Aug 2024 11:20:49 -0700 Subject: dcache: keep dentry_hashtable or d_hash_shift even when not used The runtime constant feature removes all the users of these variables, allowing the compiler to optimize them away. It's quite difficult to extract their values from the kernel text, and the memory saved by removing them is tiny, and it was never the point of this optimization. Since the dentry_hashtable is a core data structure, it's valuable for debugging tools to be able to read it easily. For instance, scripts built on drgn, like the dentrycache script[1], rely on it to be able to perform diagnostics on the contents of the dcache. Annotate it as used, so the compiler doesn't discard it. Link: https://github.com/oracle-samples/drgn-tools/blob/3afc56146f54d09dfd1f6d3c1b7436eda7e638be/drgn_tools/dentry.py#L325-L355 [1] Fixes: e3c92e81711d ("runtime constants: add x86 architecture support") Signed-off-by: Stephen Brennan Signed-off-by: Linus Torvalds --- fs/dcache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 3d8daaecb6d1..6386b9b625dd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -96,11 +96,16 @@ EXPORT_SYMBOL(dotdot_name); * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. + * + * Marking the variables "used" ensures that the compiler doesn't + * optimize them away completely on architectures with runtime + * constant infrastructure, this allows debuggers to see their + * values. But updating these values has no effect on those arches. 
*/ -static unsigned int d_hash_shift __ro_after_init; +static unsigned int d_hash_shift __ro_after_init __used; -static struct hlist_bl_head *dentry_hashtable __ro_after_init; +static struct hlist_bl_head *dentry_hashtable __ro_after_init __used; static inline struct hlist_bl_head *d_hash(unsigned long hashlen) { -- cgit From 40927f3d0972bf86357a32a5749be71a551241b6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 29 Aug 2024 09:06:28 +1000 Subject: nfsd: fix nfsd4_deleg_getattr_conflict in presence of third party lease It is not safe to dereference fl->c.flc_owner without first confirming fl->fl_lmops is the expected manager. nfsd4_deleg_getattr_conflict() tests fl_lmops but largely ignores the result and assumes that flc_owner is an nfs4_delegation anyway. This is wrong. With this patch we restore the "!= &nfsd_lease_mng_ops" case to behave as it did before the change mentioned below. This is the same as the current code, but without any reference to a possible delegation. Fixes: c5967721e106 ("NFSD: handle GETATTR conflict with write delegation") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 07f2496850c4..a366fb1c1b9b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8859,7 +8859,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, */ if (type == F_RDLCK) break; - goto break_lease; + + nfsd_stats_wdeleg_getattr_inc(nn); + spin_unlock(&ctx->flc_lock); + + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + return 0; } if (type == F_WRLCK) { struct nfs4_delegation *dp = fl->c.flc_owner; @@ -8868,7 +8876,6 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, spin_unlock(&ctx->flc_lock); return 0; } -break_lease: nfsd_stats_wdeleg_getattr_inc(nn); dp = fl->c.flc_owner; refcount_inc(&dp->dl_stid.sc_count); -- cgit From e3e6940940910c2287fe962bdf72015efd4fee81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 31 Aug 2024 17:44:51 -0400 Subject: bcachefs: Revert lockless buffered IO path We had a report of data corruption on nixos when building installer images. https://github.com/NixOS/nixpkgs/pull/321055#issuecomment-2184131334 It seems that writes are being dropped, but only when issued by QEMU, and possibly only in snapshot mode. It's undetermined whether it's write calls or dirty folios that are being dropped. Further testing, via minimizing the original patch to just the change that skips the inode lock on non-appends/truncates, reveals that it really is just not taking the inode lock that causes the corruption: it has nothing to do with the other logic changes for preserving write atomicity in corner cases. It's also kernel config dependent: it doesn't reproduce with the minimal kernel config that ktest uses, but it does reproduce with nixos's distro config. Bisecting the kernel config initially pointed the finger at page migration or compaction, but it appears that was erroneous; we haven't yet determined what kernel config option actually triggers it. Sadly it appears this will have to be reverted since we're getting too close to release and my plate is full, but we'd _really_ like to fully debug it.
My suspicion is that this patch is exposing a preexisting bug - the inode lock actually covers very little in IO paths, and we have a different lock (the pagecache add lock) that guards against races with truncate here. Fixes: 7e64c86cdc6c ("bcachefs: Buffered write path now can avoid the inode lock") Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 - fs/bcachefs/fs-io-buffered.c | 149 ++++++++++++------------------------------- 2 files changed, 40 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index ab5a7adece10..742dcdd3e5d7 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -257,7 +257,6 @@ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ x(BCH_ERR_nopromote, nopromote_enomem) \ - x(0, need_inode_lock) \ x(0, invalid_snapshot_node) \ x(0, option_needs_open_fs) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 184d03851676..ec8c427bf588 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len, - bool inode_locked) + loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); - /* - * If we're not using the inode lock, we need to lock all the folios for - * atomiticity of writes vs. other writes: - */ - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { - ret = -BCH_ERR_need_inode_lock; - goto out; - } - f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) { - BUG_ON(!inode_locked); + if (end > inode->v.i_size) i_size_write(&inode->v, end); - } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos; - bool inode_locked = false; - ssize_t written = 0, written2 = 0, ret = 0; - - /* - * We don't take the inode lock unless i_size will be changing. Folio - * locks provide exclusion with other writes, and the pagecache add lock - * provides exclusion with truncate and hole punching. - * - * There is one nasty corner case where atomicity would be broken - * without great care: when copying data from userspace to the page - * cache, we do that with faults disable - a page fault would recurse - * back into the filesystem, taking filesystem locks again, and - * deadlock; so it's done with faults disabled, and we fault in the user - * buffer when we aren't holding locks. - * - * If we do part of the write, but we then race and in the userspace - * buffer have been evicted and are no longer resident, then we have to - * drop our folio locks to re-fault them in, breaking write atomicity. - * - * To fix this, we restart the write from the start, if we weren't - * holding the inode lock. 
- * - * There is another wrinkle after that; if we restart the write from the - * start, and then get an unrecoverable error, we _cannot_ claim to - * userspace that we did not write data we actually did - so we must - * track (written2) the most we ever wrote. - */ - - if ((iocb->ki_flags & IOCB_APPEND) || - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { - inode_lock(&inode->v); - inode_locked = true; - } - - ret = generic_write_checks(iocb, iter); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); - if (ret) { - if (!inode_locked) { - inode_lock(&inode->v); - inode_locked = true; - ret = file_remove_privs_flags(file, 0); - } - if (ret) - goto unlock; - } - - ret = file_update_time(file); - if (ret) - goto unlock; - - pos = iocb->ki_pos; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; bch2_pagecache_add_get(inode); - if (!inode_locked && - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) - goto get_inode_lock; - do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1065,17 +997,12 @@ again: } } - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) - goto get_inode_lock; - if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); - if (ret == -BCH_ERR_need_inode_lock) - goto get_inode_lock; + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); if (unlikely(ret < 0)) break; @@ -1096,46 +1023,50 @@ again: } pos += ret; written += ret; - written2 = max(written, written2); - - if (ret != bytes && !inode_locked) - goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - - if (0) { -get_inode_lock: - bch2_pagecache_add_put(inode); - inode_lock(&inode->v); - inode_locked = true; - bch2_pagecache_add_get(inode); - - iov_iter_revert(iter, written); - pos -= written; - written = 0; - ret = 0; - } } while (iov_iter_count(iter)); - bch2_pagecache_add_put(inode); -unlock: - if (inode_locked) - inode_unlock(&inode->v); - iocb->ki_pos += written; + bch2_pagecache_add_put(inode); - ret = max(written, written2) ?: ret; - if (ret > 0) - ret = generic_write_sync(iocb, ret); - return ret; + return written ? written : ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t ret = iocb->ki_flags & IOCB_DIRECT - ? bch2_direct_write(iocb, iter) - : bch2_buffered_write(iocb, iter); + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: return bch2_err_class(ret); } -- cgit From 3d3020c461936009dc58702e267ff67b0076cbf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 22 Aug 2024 11:47:32 -0400 Subject: bcachefs: Mark more errors as autofix errors that are known to always be safe to fix should be autofix: this should be most errors even at this point, but that will need some thorough review. 
note that errors are still logged in the superblock, so we'll still know that they happened. Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d3a498617303..f0c14702f9e6 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -23,7 +23,7 @@ enum bch_fsck_flags { x(jset_past_bucket_end, 9, 0) \ x(jset_seq_blacklisted, 10, 0) \ x(journal_entries_missing, 11, 0) \ - x(journal_entry_replicas_not_marked, 12, 0) \ + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ x(journal_entry_past_jset_end, 13, 0) \ x(journal_entry_replicas_data_mismatch, 14, 0) \ x(journal_entry_bkey_u64s_0, 15, 0) \ @@ -288,10 +288,10 @@ enum bch_fsck_flags { x(invalid_btree_id, 274, 0) \ x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ - x(accounting_key_junk_at_end, 277, 0) \ - x(accounting_key_replicas_nr_devs_0, 278, 0) \ - x(accounting_key_replicas_nr_required_bad, 279, 0) \ - x(accounting_key_replicas_devs_unsorted, 280, 0) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit
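For reference, the sb-errors table edited above is an x-macro list: one list drives both the enum of error ids and the per-error fsck flags, so flipping a 0 to FSCK_AUTOFIX in the table is the whole change. A trimmed-down sketch of the pattern (two sample entries only; BCH_SB_ERRS() is assumed to be the list macro's real name, and the flags expansion is elided):

/* Simplified sketch of the x-macro pattern, not the real header. */
#define BCH_SB_ERRS()							\
	x(journal_entry_replicas_not_marked,	12,	FSCK_AUTOFIX)	\
	x(accounting_key_junk_at_end,		277,	FSCK_AUTOFIX)

enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
	BCH_SB_ERRS()
#undef x
};

/* The flags argument is swallowed by the ... here and consumed by a
 * second expansion elsewhere; the enum above expands to:
 *   BCH_FSCK_ERR_journal_entry_replicas_not_marked = 12,
 *   BCH_FSCK_ERR_accounting_key_junk_at_end = 277,
 */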