From 163eae0fb0d4c610c59a8de38040f8e12f89fd43 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Sat, 8 Jun 2024 17:13:51 +0200 Subject: netfs: Switch debug logging to pr_debug() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of inventing a custom way to conditionally enable debugging, just make use of pr_debug(), which also has dynamic debugging facilities and is more likely known to someone who hunts a problem in the netfs code. Also drop the module parameter netfs_debug which didn't have any effect without further source changes. (The variable netfs_debug was only used in #ifdef blocks for cpp vars that don't exist; Note that CONFIG_NETFS_DEBUG isn't settable via kconfig, a variable with that name never existed in the mainline and is probably just taken over (and renamed) from similar custom debug logging implementations.) Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20240608151352.22860-2-ukleinek@kernel.org Signed-off-by: Christian Brauner --- fs/netfs/buffered_read.c | 14 +++++++------- fs/netfs/buffered_write.c | 12 ++++++------ fs/netfs/direct_read.c | 2 +- fs/netfs/direct_write.c | 8 ++++---- fs/netfs/fscache_cache.c | 4 ++-- fs/netfs/fscache_cookie.c | 28 ++++++++++++++-------------- fs/netfs/fscache_io.c | 12 ++++++------ fs/netfs/fscache_main.c | 2 +- fs/netfs/fscache_volume.c | 4 ++-- fs/netfs/internal.h | 33 +-------------------------------- fs/netfs/io.c | 12 ++++++------ fs/netfs/main.c | 4 ---- fs/netfs/misc.c | 4 ++-- fs/netfs/write_collect.c | 16 ++++++++-------- fs/netfs/write_issue.c | 36 ++++++++++++++++++------------------ 15 files changed, 78 insertions(+), 113 deletions(-) (limited to 'fs') diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index a6bb03bea920..4c0401dbbfcf 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -117,7 +117,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (folio->index == rreq->no_unlock_folio && test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) - _debug("no unlock"); + kdebug("no unlock"); else folio_unlock(folio); } @@ -204,7 +204,7 @@ void netfs_readahead(struct readahead_control *ractl) struct netfs_inode *ctx = netfs_inode(ractl->mapping->host); int ret; - _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); + kenter("%lx,%x", readahead_index(ractl), readahead_count(ractl)); if (readahead_count(ractl) == 0) return; @@ -268,7 +268,7 @@ int netfs_read_folio(struct file *file, struct folio *folio) struct folio *sink = NULL; int ret; - _enter("%lx", folio->index); + kenter("%lx", folio->index); rreq = netfs_alloc_request(mapping, file, folio_file_pos(folio), folio_size(folio), @@ -508,7 +508,7 @@ retry: have_folio: *_folio = folio; - _leave(" = 0"); + kleave(" = 0"); return 0; error_put: @@ -518,7 +518,7 @@ error: folio_unlock(folio); folio_put(folio); } - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } EXPORT_SYMBOL(netfs_write_begin); @@ -536,7 +536,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t flen = folio_size(folio); int ret; - _enter("%zx @%llx", flen, start); + kenter("%zx @%llx", flen, start); ret = -ENOMEM; @@ -567,7 +567,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); error: - _leave(" = %d", ret); + kleave(" = %d", ret); return ret; } diff --git a/fs/netfs/buffered_write.c 
b/fs/netfs/buffered_write.c index 1121601536d1..42b48a2b99cd 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -56,7 +56,7 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx, struct netfs_group *group = netfs_folio_group(folio); loff_t pos = folio_file_pos(folio); - _enter(""); + kenter(""); if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) return NETFS_FLUSH_CONTENT; @@ -272,12 +272,12 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, */ howto = netfs_how_to_modify(ctx, file, folio, netfs_group, flen, offset, part, maybe_trouble); - _debug("howto %u", howto); + kdebug("howto %u", howto); switch (howto) { case NETFS_JUST_PREFETCH: ret = netfs_prefetch_for_write(file, folio, offset, part); if (ret < 0) { - _debug("prefetch = %zd", ret); + kdebug("prefetch = %zd", ret); goto error_folio_unlock; } break; @@ -418,7 +418,7 @@ out: } iocb->ki_pos += written; - _leave(" = %zd [%zd]", written, ret); + kleave(" = %zd [%zd]", written, ret); return written ? written : ret; error_folio_unlock: @@ -491,7 +491,7 @@ ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct netfs_inode *ictx = netfs_inode(inode); ssize_t ret; - _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; @@ -528,7 +528,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr vm_fault_t ret = VM_FAULT_RETRY; int err; - _enter("%lx", folio->index); + kenter("%lx", folio->index); sb_start_pagefault(inode->i_sb); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 10a1e4da6bda..b6debac6205f 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -33,7 +33,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i size_t orig_count = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); if (!orig_count) return 0; /* Don't update atime */ diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index f516460e994e..cce072abfd18 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -37,7 +37,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * size_t len = iov_iter_count(iter); bool async = !is_sync_kiocb(iocb); - _enter(""); + kenter(""); /* We're going to need a bounce buffer if what we transmit is going to * be different in some way to the source buffer, e.g. because it gets @@ -45,7 +45,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * */ // TODO - _debug("uw %llx-%llx", start, end); + kdebug("uw %llx-%llx", start, end); wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start, iocb->ki_flags & IOCB_DIRECT ? 
@@ -95,7 +95,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * wreq->cleanup = netfs_cleanup_dio_write; ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter)); if (ret < 0) { - _debug("begin = %zd", ret); + kdebug("begin = %zd", ret); goto out; } @@ -142,7 +142,7 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos = iocb->ki_pos; unsigned long long end = pos + iov_iter_count(from) - 1; - _enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); + kenter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode)); if (!iov_iter_count(from)) return 0; diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..288a73c3072d 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -237,7 +237,7 @@ int fscache_add_cache(struct fscache_cache *cache, { int n_accesses; - _enter("{%s,%s}", ops->name, cache->name); + kenter("{%s,%s}", ops->name, cache->name); BUG_ON(fscache_cache_state(cache) != FSCACHE_CACHE_IS_PREPARING); @@ -257,7 +257,7 @@ int fscache_add_cache(struct fscache_cache *cache, up_write(&fscache_addremove_sem); pr_notice("Cache \"%s\" added (type %s)\n", cache->name, ops->name); - _leave(" = 0 [%s]", cache->name); + kleave(" = 0 [%s]", cache->name); return 0; } EXPORT_SYMBOL(fscache_add_cache); diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index bce2492186d0..4d1e8bf4c615 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -456,7 +456,7 @@ struct fscache_cookie *__fscache_acquire_cookie( { struct fscache_cookie *cookie; - _enter("V=%x", volume->debug_id); + kenter("V=%x", volume->debug_id); if (!index_key || !index_key_len || index_key_len > 255 || aux_data_len > 255) return NULL; @@ -484,7 +484,7 @@ struct fscache_cookie *__fscache_acquire_cookie( trace_fscache_acquire(cookie); fscache_stat(&fscache_n_acquires_ok); - _leave(" = c=%08x", cookie->debug_id); + kleave(" = c=%08x", cookie->debug_id); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); @@ -505,7 +505,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) enum fscache_access_trace trace = fscache_access_lookup_cookie_end_failed; bool need_withdraw = false; - _enter(""); + kenter(""); if (!cookie->volume->cache_priv) { fscache_create_volume(cookie->volume, true); @@ -519,7 +519,7 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) if (cookie->state != FSCACHE_COOKIE_STATE_FAILED) fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_QUIESCENT); need_withdraw = true; - _leave(" [fail]"); + kleave(" [fail]"); goto out; } @@ -572,7 +572,7 @@ void __fscache_use_cookie(struct fscache_cookie *cookie, bool will_modify) bool queue = false; int n_active; - _enter("c=%08x", cookie->debug_id); + kenter("c=%08x", cookie->debug_id); if (WARN(test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), "Trying to use relinquished cookie\n")) @@ -636,7 +636,7 @@ again: spin_unlock(&cookie->lock); if (queue) fscache_queue_cookie(cookie, fscache_cookie_get_use_work); - _leave(""); + kleave(""); } EXPORT_SYMBOL(__fscache_use_cookie); @@ -702,7 +702,7 @@ static void fscache_cookie_state_machine(struct fscache_cookie *cookie) enum fscache_cookie_state state; bool wake = false; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); again: spin_lock(&cookie->lock); @@ -820,7 +820,7 @@ out: spin_unlock(&cookie->lock); if (wake) wake_up_cookie_state(cookie); - _leave(""); + kleave(""); } 
static void fscache_cookie_worker(struct work_struct *work) @@ -867,7 +867,7 @@ static void fscache_cookie_lru_do_one(struct fscache_cookie *cookie) set_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags); spin_unlock(&cookie->lock); fscache_stat(&fscache_n_cookies_lru_expired); - _debug("lru c=%x", cookie->debug_id); + kdebug("lru c=%x", cookie->debug_id); __fscache_withdraw_cookie(cookie); } @@ -971,7 +971,7 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) if (retire) fscache_stat(&fscache_n_relinquishes_retire); - _enter("c=%08x{%d},%d", + kenter("c=%08x{%d},%d", cookie->debug_id, atomic_read(&cookie->n_active), retire); if (WARN(test_and_set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags), @@ -1050,7 +1050,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, { bool is_caching; - _enter("c=%x", cookie->debug_id); + kenter("c=%x", cookie->debug_id); fscache_stat(&fscache_n_invalidates); @@ -1072,7 +1072,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, case FSCACHE_COOKIE_STATE_INVALIDATING: /* is_still_valid will catch it */ default: spin_unlock(&cookie->lock); - _leave(" [no %u]", cookie->state); + kleave(" [no %u]", cookie->state); return; case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -1081,7 +1081,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); - _leave(" [look %x]", cookie->inval_counter); + kleave(" [look %x]", cookie->inval_counter); return; case FSCACHE_COOKIE_STATE_ACTIVE: @@ -1094,7 +1094,7 @@ void __fscache_invalidate(struct fscache_cookie *cookie, if (is_caching) fscache_queue_cookie(cookie, fscache_cookie_get_inval_work); - _leave(" [inv]"); + kleave(" [inv]"); return; } } diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index 38637e5c9b57..bf4eaeec44fb 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -28,12 +28,12 @@ bool fscache_wait_for_operation(struct netfs_cache_resources *cres, again: if (!fscache_cache_is_live(cookie->volume->cache)) { - _leave(" [broken]"); + kleave(" [broken]"); return false; } state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_CREATING: @@ -52,7 +52,7 @@ again: case FSCACHE_COOKIE_STATE_DROPPED: case FSCACHE_COOKIE_STATE_RELINQUISHING: default: - _leave(" [not live]"); + kleave(" [not live]"); return false; } @@ -92,7 +92,7 @@ again: spin_lock(&cookie->lock); state = fscache_cookie_state(cookie); - _enter("c=%08x{%u},%x", cookie->debug_id, state, want_state); + kenter("c=%08x{%u},%x", cookie->debug_id, state, want_state); switch (state) { case FSCACHE_COOKIE_STATE_LOOKING_UP: @@ -140,7 +140,7 @@ failed: cres->cache_priv = NULL; cres->ops = NULL; fscache_end_cookie_access(cookie, fscache_access_io_not_live); - _leave(" = -ENOBUFS"); + kleave(" = -ENOBUFS"); return -ENOBUFS; } @@ -224,7 +224,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, if (len == 0) goto abandon; - _enter("%llx,%zx", start, len); + kenter("%llx,%zx", start, len); wreq = kzalloc(sizeof(struct fscache_write_request), GFP_NOFS); if (!wreq) diff --git a/fs/netfs/fscache_main.c b/fs/netfs/fscache_main.c index 42e98bb523e3..bf9b33d26e31 100644 --- a/fs/netfs/fscache_main.c +++ b/fs/netfs/fscache_main.c @@ -99,7 +99,7 @@ error_wq: */ void __exit fscache_exit(void) { - _enter(""); + kenter(""); kmem_cache_destroy(fscache_cookie_jar); 
fscache_proc_cleanup(); diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index cdf991bdd9de..fbdc428aaea9 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -251,7 +251,7 @@ static struct fscache_volume *fscache_alloc_volume(const char *volume_key, fscache_see_volume(volume, fscache_volume_new_acquire); fscache_stat(&fscache_n_volumes); up_write(&fscache_addremove_sem); - _leave(" = v=%x", volume->debug_id); + kleave(" = v=%x", volume->debug_id); return volume; err_vol: @@ -452,7 +452,7 @@ void fscache_withdraw_volume(struct fscache_volume *volume) { int n_accesses; - _debug("withdraw V=%x", volume->debug_id); + kdebug("withdraw V=%x", volume->debug_id); /* Allow wakeups on dec-to-0 */ n_accesses = atomic_dec_return(&volume->n_accesses); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 95e281a8af78..de59e39e39a7 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -34,7 +34,6 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync); /* * main.c */ -extern unsigned int netfs_debug; extern struct list_head netfs_io_requests; extern spinlock_t netfs_proc_lock; extern mempool_t netfs_request_pool; @@ -365,42 +364,12 @@ void fscache_create_volume(struct fscache_volume *volume, bool wait); * debug tracing */ #define dbgprintk(FMT, ...) \ - printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) + pr_debug("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__) #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__) -#ifdef __KDEBUG -#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__) -#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__) -#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__) - -#elif defined(CONFIG_NETFS_DEBUG) -#define _enter(FMT, ...) \ -do { \ - if (netfs_debug) \ - kenter(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _leave(FMT, ...) \ -do { \ - if (netfs_debug) \ - kleave(FMT, ##__VA_ARGS__); \ -} while (0) - -#define _debug(FMT, ...) \ -do { \ - if (netfs_debug) \ - kdebug(FMT, ##__VA_ARGS__); \ -} while (0) - -#else -#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__) -#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__) -#define _debug(FMT, ...) 
no_printk(FMT, ##__VA_ARGS__) -#endif - /* * assertions */ diff --git a/fs/netfs/io.c b/fs/netfs/io.c index c93851b98368..c7576481c321 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -130,7 +130,7 @@ static void netfs_reset_subreq_iter(struct netfs_io_request *rreq, if (count == remaining) return; - _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", + kdebug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n", rreq->debug_id, subreq->debug_index, iov_iter_count(&subreq->io_iter), subreq->transferred, subreq->len, rreq->i_size, @@ -326,7 +326,7 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq, struct netfs_io_request *rreq = subreq->rreq; int u; - _enter("R=%x[%x]{%llx,%lx},%zd", + kenter("R=%x[%x]{%llx,%lx},%zd", rreq->debug_id, subreq->debug_index, subreq->start, subreq->flags, transferred_or_error); @@ -435,7 +435,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq, struct netfs_inode *ictx = netfs_inode(rreq->inode); size_t lsize; - _enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); + kenter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size); if (rreq->origin != NETFS_DIO_READ) { source = netfs_cache_prepare_read(subreq, rreq->i_size); @@ -518,7 +518,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq, subreq->start = rreq->start + rreq->submitted; subreq->len = io_iter->count; - _debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); + kdebug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted); list_add_tail(&subreq->rreq_link, &rreq->subrequests); /* Call out to the cache to find out what it can do with the remaining @@ -570,7 +570,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) struct iov_iter io_iter; int ret; - _enter("R=%x %llx-%llx", + kenter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); if (rreq->len == 0) { @@ -593,7 +593,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync) atomic_set(&rreq->nr_outstanding, 1); io_iter = rreq->io_iter; do { - _debug("submit %llx + %llx >= %llx", + kdebug("submit %llx + %llx >= %llx", rreq->start, rreq->submitted, rreq->i_size); if (rreq->origin == NETFS_DIO_READ && rreq->start + rreq->submitted >= rreq->i_size) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 5f0f438e5d21..db824c372842 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -20,10 +20,6 @@ MODULE_LICENSE("GPL"); EXPORT_TRACEPOINT_SYMBOL(netfs_sreq); -unsigned netfs_debug; -module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask"); - static struct kmem_cache *netfs_request_slab; static struct kmem_cache *netfs_subrequest_slab; mempool_t netfs_request_pool; diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index bc1fc54fb724..607dd6327a60 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -107,7 +107,7 @@ bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio) struct fscache_cookie *cookie = netfs_i_cookie(ictx); bool need_use = false; - _enter(""); + kenter(""); if (!filemap_dirty_folio(mapping, folio)) return false; @@ -180,7 +180,7 @@ void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) struct netfs_folio *finfo; size_t flen = folio_size(folio); - _enter("{%lx},%zx,%zx", folio->index, offset, length); + kenter("{%lx},%zx,%zx", folio->index, offset, length); if (!folio_test_private(folio)) return; diff --git a/fs/netfs/write_collect.c 
b/fs/netfs/write_collect.c index 60112e4b2c5e..79058e238660 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -161,7 +161,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, { struct list_head *next; - _enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); + kenter("R=%x[%x:]", wreq->debug_id, stream->stream_nr); if (list_empty(&stream->subrequests)) return; @@ -374,7 +374,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq) unsigned int notes; int s; - _enter("%llx-%llx", wreq->start, wreq->start + wreq->len); + kenter("%llx-%llx", wreq->start, wreq->start + wreq->len); trace_netfs_collect(wreq); trace_netfs_rreq(wreq, netfs_rreq_trace_collect); @@ -409,7 +409,7 @@ reassess_streams: front = stream->front; while (front) { trace_netfs_collect_sreq(wreq, front); - //_debug("sreq [%x] %llx %zx/%zx", + //kdebug("sreq [%x] %llx %zx/%zx", // front->debug_index, front->start, front->transferred, front->len); /* Stall if there may be a discontinuity. */ @@ -598,7 +598,7 @@ reassess_streams: out: netfs_put_group_many(wreq->group, wreq->nr_group_rel); wreq->nr_group_rel = 0; - _leave(" = %x", notes); + kleave(" = %x", notes); return; need_retry: @@ -606,7 +606,7 @@ need_retry: * that any partially completed op will have had any wholly transferred * folios removed from it. */ - _debug("retry"); + kdebug("retry"); netfs_retry_writes(wreq); goto out; } @@ -621,7 +621,7 @@ void netfs_write_collection_worker(struct work_struct *work) size_t transferred; int s; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); netfs_see_request(wreq, netfs_rreq_trace_see_work); if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { @@ -684,7 +684,7 @@ void netfs_write_collection_worker(struct work_struct *work) if (wreq->origin == NETFS_DIO_WRITE) inode_dio_end(wreq->inode); - _debug("finished"); + kdebug("finished"); trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags); wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS); @@ -743,7 +743,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, struct netfs_io_request *wreq = subreq->rreq; struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; - _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); + kenter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); switch (subreq->source) { case NETFS_UPLOAD_TO_SERVER: diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index e190043bc0da..f61f30ed8546 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -99,7 +99,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, if (IS_ERR(wreq)) return wreq; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); ictx = netfs_inode(wreq->inode); if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags)) @@ -159,7 +159,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq, subreq->max_nr_segs = INT_MAX; subreq->stream_nr = stream->stream_nr; - _enter("R=%x[%x]", wreq->debug_id, subreq->debug_index); + kenter("R=%x[%x]", wreq->debug_id, subreq->debug_index); trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), @@ -215,7 +215,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, { struct netfs_io_request *wreq = subreq->rreq; - _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); + kenter("R=%x[%x],%zx", wreq->debug_id, 
subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) return netfs_write_subrequest_terminated(subreq, subreq->error, false); @@ -272,11 +272,11 @@ int netfs_advance_write(struct netfs_io_request *wreq, size_t part; if (!stream->avail) { - _leave("no write"); + kleave("no write"); return len; } - _enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); + kenter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0); if (subreq && start != subreq->start + subreq->len) { netfs_issue_write(wreq, stream); @@ -288,7 +288,7 @@ int netfs_advance_write(struct netfs_io_request *wreq, subreq = stream->construct; part = min(subreq->max_len - subreq->len, len); - _debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); + kdebug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len); subreq->len += part; subreq->nr_segs++; @@ -319,7 +319,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, bool to_eof = false, streamw = false; bool debug = false; - _enter(""); + kenter(""); /* netfs_perform_write() may shift i_size around the page or from out * of the page to beyond it, but cannot move i_size into or through the @@ -329,7 +329,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, if (fpos >= i_size) { /* mmap beyond eof. */ - _debug("beyond eof"); + kdebug("beyond eof"); folio_start_writeback(folio); folio_unlock(folio); wreq->nr_group_rel += netfs_folio_written_back(folio); @@ -363,7 +363,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, } flen -= foff; - _debug("folio %zx %zx %zx", foff, flen, fsize); + kdebug("folio %zx %zx %zx", foff, flen, fsize); /* Deal with discontinuities in the stream of dirty pages. These can * arise from a number of sources: @@ -487,7 +487,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq, for (int s = 0; s < NR_IO_STREAMS; s++) netfs_issue_write(wreq, &wreq->io_streams[s]); - _leave(" = 0"); + kleave(" = 0"); return 0; } @@ -522,7 +522,7 @@ int netfs_writepages(struct address_space *mapping, netfs_stat(&netfs_n_wh_writepages); do { - _debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); + kdebug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted); /* It appears we don't have to handle cyclic writeback wrapping. 
*/ WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted); @@ -546,14 +546,14 @@ int netfs_writepages(struct address_space *mapping, mutex_unlock(&ictx->wb_lock); netfs_put_request(wreq, false, netfs_rreq_trace_put_return); - _leave(" = %d", error); + kleave(" = %d", error); return error; couldnt_start: netfs_kill_dirty_pages(mapping, wbc, folio); out: mutex_unlock(&ictx->wb_lock); - _leave(" = %d", error); + kleave(" = %d", error); return error; } EXPORT_SYMBOL(netfs_writepages); @@ -590,7 +590,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache) { - _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", + kenter("R=%x ic=%zu ws=%u cp=%zu tp=%u", wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); if (!*writethrough_cache) { @@ -624,7 +624,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr struct netfs_inode *ictx = netfs_inode(wreq->inode); int ret; - _enter("R=%x", wreq->debug_id); + kenter("R=%x", wreq->debug_id); if (writethrough_cache) netfs_write_folio(wreq, wbc, writethrough_cache); @@ -652,7 +652,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t loff_t start = wreq->start; int error = 0; - _enter("%zx", len); + kenter("%zx", len); if (wreq->origin == NETFS_DIO_WRITE) inode_dio_begin(wreq->inode); @@ -660,7 +660,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t while (len) { // TODO: Prepare content encryption - _debug("unbuffered %zx", len); + kdebug("unbuffered %zx", len); part = netfs_advance_write(wreq, upload, start, len, false); start += part; len -= part; @@ -679,6 +679,6 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t if (list_empty(&upload->subrequests)) netfs_wake_write_collector(wreq, false); - _leave(" = %d", error); + kleave(" = %d", error); return error; } -- cgit From 759b2e800f160d0749696669a0874436a72482c6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 11:06:31 -0400 Subject: bcachefs: Switch online_reserved shutdown assert to WARN() Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index fb906467201e..24953cd9bbbc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -563,8 +563,11 @@ static void __bch2_fs_free(struct bch_fs *c) BUG_ON(atomic_read(&c->journal_keys.ref)); bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); - EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved)); - free_percpu(c->online_reserved); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); + WARN(v, "online_reserved not 0 at shutdown: %lli", v); + free_percpu(c->online_reserved); + } darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); -- cgit From 84db60001619b08e336a9e0d2344e71cec9011d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 13:27:54 -0400 Subject: bcachefs: Delete old faulty bch2_trans_unlock() call the unlock is now in read_extent, this fixes an assertion pop in read_from_stale_dirty_pointer() Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index c97fa7002b06..2a5c4371f6f8 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ 
-389,7 +389,6 @@ retry: bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, -- cgit From 600b8be5e74713f220a18061c82c05f1e95ce9f6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 19:16:09 -0400 Subject: bcachefs: Change bch2_fs_journal_stop() BUG_ON() to warning Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 13669dd0e375..51c2fa09d951 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1184,9 +1184,11 @@ void bch2_fs_journal_stop(struct journal *j) journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); - BUG_ON(!bch2_journal_error(j) && - test_bit(JOURNAL_replay_done, &j->flags) && - j->last_empty_seq != journal_cur_seq(j)); + WARN(!bch2_journal_error(j) && + test_bit(JOURNAL_replay_done, &j->flags) && + j->last_empty_seq != journal_cur_seq(j), + "journal shutdown error: cur seq %llu but last empty seq %llu", + journal_cur_seq(j), j->last_empty_seq); if (!bch2_journal_error(j)) clear_bit(JOURNAL_running, &j->flags); -- cgit From a0bd30e4ea9da9499c5527dc84a0e2d291f273d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 19:41:00 -0400 Subject: bcachefs: Fix shift greater than integer size Reported-by: syzbot+e5292b50f1957164a4b6@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 94a1d1982fa8..587d7318a2e8 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -660,8 +660,9 @@ int bch2_bkey_format_invalid(struct bch_fs *c, bch2_bkey_format_field_overflows(f, i)) { unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); - u64 packed_max = f->bits_per_field[i] - ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + unsigned packed_bits = min(64, f->bits_per_field[i]); + u64 packed_max = packed_bits + ? ~((~0ULL << 1) << (packed_bits - 1)) : 0; prt_printf(err, "field %u too large: %llu + %llu > %llu", -- cgit From 44ec5990357b9ebb8e5e88bb4f98a2746b938fab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 19:47:31 -0400 Subject: bcachefs: Don't use the new_fs() bucket alloc path on an initialized fs On a new filesystem or device we have to allocate the journal with a bump allocator, because allocation info isn't ready yet - but when hot-adding a device that doesn't have a journal, we don't want to use that path. 
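[As an illustration of the pattern this patch adopts, here is a minimal userspace C sketch, not the bcachefs code; all names are hypothetical. The idea is to thread a new_fs flag down from the call sites so the journal allocator can tell mkfs/new-device setup apart from hot-adding a device to a running filesystem.]

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the two allocation strategies. */
static int alloc_with_bump_allocator(void)
{
	/* Only valid before allocation info has been initialized. */
	puts("journal: bump allocation");
	return 0;
}

static int alloc_with_normal_path(void)
{
	puts("journal: normal bucket allocation");
	return 0;
}

/* Mirrors the bool parameter added to bch2_dev_journal_alloc(). */
static int dev_journal_alloc(bool new_fs)
{
	return new_fs ? alloc_with_bump_allocator()
		      : alloc_with_normal_path();
}

int main(void)
{
	dev_journal_alloc(true);   /* mkfs path: bch2_fs_journal_alloc() passes true */
	dev_journal_alloc(false);  /* hot-add path: bch2_dev_online() passes false */
	return 0;
}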
Reported-by: syzbot+24a867cb90d8315cccff@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 6 +++--- fs/bcachefs/journal.h | 2 +- fs/bcachefs/super.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 51c2fa09d951..0f4ad9e64ad1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1095,7 +1095,7 @@ unlock: return ret; } -int bch2_dev_journal_alloc(struct bch_dev *ca) +int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { unsigned nr; int ret; @@ -1117,7 +1117,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); + ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); err: bch_err_fn(ca, ret); return ret; @@ -1129,7 +1129,7 @@ int bch2_fs_journal_alloc(struct bch_fs *c) if (ca->journal.nr) continue; - int ret = bch2_dev_journal_alloc(ca); + int ret = bch2_dev_journal_alloc(ca, true); if (ret) { percpu_ref_put(&ca->io_ref); return ret; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index fd1f7cdaa8bc..bc6b9c39dcb4 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -433,7 +433,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); -int bch2_dev_journal_alloc(struct bch_dev *); +int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 24953cd9bbbc..da735608d47c 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1772,7 +1772,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, true); bch_err_msg(c, ret, "allocating journal"); if (ret) goto err; @@ -1932,7 +1932,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) } if (!ca->journal.nr) { - ret = bch2_dev_journal_alloc(ca); + ret = bch2_dev_journal_alloc(ca, false); bch_err_msg(ca, ret, "allocating journal"); if (ret) goto err; -- cgit From 1539bdf516601cf05ab11cbc6ddbfc31d37d74f7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 20:40:24 -0400 Subject: bcachefs: Fix bch2_read_retry_nodecode() BCH_READ_NODECODE mode - used by the move paths - really wants to use only the original rbio, but the retry path really wants to clone - oof. Make sure to copy the crc of the pointer we read from back to the original rbio, or we'll see spurious checksum errors later. 
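[A minimal sketch of the bug class being fixed, with hypothetical types rather than the bcachefs structures: when a retry path reads through a clone but verification later runs against the original request, the state describing where the data was actually read from has to be copied back.]

#include <stdio.h>

struct pick { unsigned dev; unsigned crc; };
struct rbio { struct pick pick; };

/* Retry path: reads through a clone, possibly from a different replica. */
static void retry_nodecode(struct rbio *orig)
{
	struct rbio clone = *orig;

	clone.pick.dev = 1;       /* the retry picked another replica... */
	clone.pick.crc = 0xbeef;  /* ...whose data carries a different crc */

	/* The fix: propagate the pick back so the checksum check done
	 * against the original rbio uses the crc of the data we read. */
	orig->pick = clone.pick;
}

int main(void)
{
	struct rbio r = { .pick = { 0, 0x1234 } };

	retry_nodecode(&r);
	printf("verify against crc %#x from dev %u\n", r.pick.crc, r.pick.dev);
	return 0;
}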
Signed-off-by: Kent Overstreet --- fs/bcachefs/io_read.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 2a5c4371f6f8..ebf39ef72fb2 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -1003,6 +1003,9 @@ get_bio: rbio->promote = promote; INIT_WORK(&rbio->work, NULL); + if (flags & BCH_READ_NODECODE) + orig->pick = pick; + rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; -- cgit From 67c564111f0e705dc272035197606c37bae94610 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Jun 2024 21:08:01 -0400 Subject: bcachefs: Fix loop restart in bch2_btree_transactions_read() Accidental infinite loop; also fix btree_deadlock_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index f0d4727c4dc2..ebabab171fe5 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -610,7 +610,7 @@ restart: list_sort(&c->btree_trans_list, list_ptr_order_cmp); list_for_each_entry(trans, &c->btree_trans_list, list) { - if ((ulong) trans < i->iter) + if ((ulong) trans <= i->iter) continue; i->iter = (ulong) trans; @@ -832,16 +832,16 @@ static const struct file_operations btree_transaction_stats_op = { static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; - pid_t iter = 0; + ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); - list_for_each_entry(trans, &c->btree_trans_list, list) { - struct task_struct *task = READ_ONCE(trans->locking_wait.task); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); - if (!task || task->pid <= iter) + list_for_each_entry(trans, &c->btree_trans_list, list) { + if ((ulong) trans <= iter) continue; - iter = task->pid; + iter = (ulong) trans; if (!closure_get_not_zero(&trans->ref)) continue; -- cgit From ef05bdf5d61529daa82058db87fab2752adfc8ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 14:08:49 -0400 Subject: bcachefs: Add missing printbuf_tabstops_reset() calls Fixes warnings from bch2_print_allocator_stuck() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 ++ fs/bcachefs/journal.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 9d3d64746a5b..27d97c22ae27 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1703,6 +1703,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 24); percpu_down_read(&c->mark_lock); @@ -1736,6 +1737,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; + printbuf_tabstops_reset(out); printbuf_tabstop_push(out, 12); printbuf_tabstop_push(out, 16); printbuf_tabstop_push(out, 16); diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 0f4ad9e64ad1..10b19791ec98 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1420,8 +1420,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) unsigned long now = jiffies; u64 nr_writes = j->nr_flush_writes + 
j->nr_noflush_writes; - if (!out->nr_tabstops) - printbuf_tabstop_push(out, 28); + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); out->atomic++; rcu_read_lock(); -- cgit From 92e1c29ae803f85d2f50b4450becb258acae4311 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 18:32:01 -0400 Subject: bcachefs: bch2_btree_write_buffer_maybe_flush() Add a new helper for checking references to write buffer btrees, where we need a flush before we definitively know we have an inconsistency. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 68 ++++++++++++++-------------------------- fs/bcachefs/bkey.h | 7 +++++ fs/bcachefs/btree_write_buffer.c | 37 ++++++++++++++++++++++ fs/bcachefs/btree_write_buffer.h | 3 ++ 4 files changed, 71 insertions(+), 44 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 4321f9fb73bd..e8037b5d8332 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -434,13 +434,6 @@ int bch2_check_btree_backpointers(struct bch_fs *c) return ret; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - struct extents_to_bp_state { struct bpos bucket_start; struct bpos bucket_end; @@ -536,11 +529,8 @@ static int check_bp_exists(struct btree_trans *trans, struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c bp_k; - struct bkey_buf tmp; int ret = 0; - bch2_bkey_buf_init(&tmp); - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); if (!ca) { prt_str(&buf, "extent for nonexistent device:bucket "); @@ -565,22 +555,9 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { - bch2_bkey_buf_reassemble(&tmp, c, orig_k); - - if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) { - if (bp.level) { - bch2_trans_unlock(trans); - bch2_btree_interior_updates_flush(c); - } - - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; - - bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; goto check_existing_bp; } @@ -589,7 +566,6 @@ err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_bkey_buf_exit(&tmp, c); bch2_dev_put(ca); printbuf_exit(&buf); return ret; @@ -905,7 +881,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c_backpointer bp, - struct bpos *last_flushed_pos) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -925,20 +901,18 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { - *last_flushed_pos = bp.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; - goto out; - } + if (!k.k) { + ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + if (ret) + goto out; - if (fsck_err_on(!k.k, c, - backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? 
"btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; + if (fsck_err(c, backpointer_to_missing_ptr, + "backpointer for missing %s\n %s", + bp.v->level ? "btree node" : "extent", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } } out: fsck_err: @@ -951,14 +925,20 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bpos last_flushed_pos = SPOS_MAX; + struct bkey_buf last_flushed; - return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_one_backpointer(trans, start, end, bkey_s_c_to_backpointer(k), - &last_flushed_pos)); + &last_flushed)); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret; } int bch2_check_backpointers_to_extents(struct bch_fs *c) diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index fcd43915df07..936357149cf0 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -194,6 +194,13 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? l : r; } +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return bpos_eq(l.k->p, r.k->p) && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 75c8a196b3f6..d0e92d948002 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "error.h" +#include "extents.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -492,6 +494,41 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) return ret; } +/** + * In check and repair code, when checking references to write buffer btrees we + * need to issue a flush before we have a definitive error: this issues a flush + * if this is a key we haven't yet checked. 
+ */ +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_buf tmp; + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { + bch2_trans_unlock(trans); + bch2_btree_interior_updates_flush(c); + } + + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + static void bch2_btree_write_buffer_flush_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index eebcd2b15249..dd5e64218b50 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -23,6 +23,9 @@ int bch2_btree_write_buffer_flush_sync(struct btree_trans *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); +struct bkey_buf; +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); + struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; size_t room; -- cgit From d39881d2da2a621df49118bfe5b594c7e8099aa6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 18:35:18 -0400 Subject: bcachefs: add check for missing fragmentation in check_alloc_to_lru_ref() We need to make sure we're not missing any fragmentation entries in the LRU btree after repairing the alloc btree. Also, use the new bch2_btree_write_buffer_maybe_flush() helper; this was only working without it before since bucket invalidation (usually) wasn't happening while fsck was running.
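[To illustrate the helper's contract, a minimal sketch simplified to integers, not the bcachefs code: before treating a missing entry as an inconsistency, flush the write buffer once per referring key and retry the lookup; only a miss that survives the flush is a real error.]

#include <stdbool.h>
#include <stdio.h>

static int last_flushed = -1;	/* models the last_flushed bkey_buf */

/* Returns true if it flushed and the caller should retry the lookup. */
static bool maybe_flush(int referring_key)
{
	if (referring_key == last_flushed)
		return false;	/* already flushed for this key */

	/* ...flush write-buffered updates into the btree here... */
	last_flushed = referring_key;
	return true;
}

static void check_ref(int referring_key, bool found)
{
	if (!found && maybe_flush(referring_key))
		found = true;	/* model: the retried lookup now hits */

	puts(found ? "ok" : "missing entry: repair it");
}

int main(void)
{
	check_ref(42, false);	/* miss resolved by a single flush */
	check_ref(42, false);	/* second miss on the same key: real error */
	return 0;
}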
Co-developed-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 48 ++++++++++++++++++++---------------------- fs/bcachefs/lru.c | 39 ++++++++++++++++++++++++++++++++++ fs/bcachefs/lru.h | 3 +++ 3 files changed, 65 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1de9fac3bcf4..658f11aebda1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -3,6 +3,7 @@ #include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" +#include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_key_cache.h" @@ -1553,13 +1554,13 @@ err: } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, - struct btree_iter *alloc_iter) + struct btree_iter *alloc_iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter lru_iter; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - struct bkey_s_c alloc_k, lru_k; + struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret; @@ -1573,6 +1574,14 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); + if (a->fragmentation_lru) { + ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + a->fragmentation_lru, + alloc_k, last_flushed); + if (ret) + return ret; + } + if (a->data_type != BCH_DATA_cached) return 0; @@ -1597,41 +1606,30 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]), 0); - ret = bkey_err(lru_k); + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + alloc_k, last_flushed); if (ret) - return ret; - - if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, - alloc_key_to_missing_lru_entry, - "missing lru entry\n" - " %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_lru_set(trans, - alloc_k.k->p.inode, - bucket_to_u64(alloc_k.k->p), - a->io_time[READ]); - if (ret) - goto err; - } + goto err; err: fsck_err: - bch2_trans_iter_exit(trans, &lru_iter); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index a40d116224ed..b12894ef44f3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -77,6 +77,45 @@ static const char * const bch2_lru_types[] = { NULL }; +int bch2_lru_check_set(struct btree_trans *trans, + u16 lru_id, u64 time, + struct bkey_s_c referring_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter lru_iter; + struct bkey_s_c lru_k = + bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(lru_id, + bucket_to_u64(referring_k.k->p), + time), 0); + int ret = bkey_err(lru_k); + if (ret) + return ret; + + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, 
last_flushed); + if (ret) + goto err; + + if (fsck_err(c, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n" + " %s", + bch2_lru_types[lru_type(lru_k)], + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index bd71ba77de07..ed75bcf59d47 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -61,6 +61,9 @@ int bch2_lru_del(struct btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +struct bkey_buf; +int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); + int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ -- cgit From b5cbb42dc59f519fa3cf49b9afbd5ee4805be01b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 18:37:13 -0400 Subject: bcachefs: Repair fragmentation_lru in alloc_write_key() fragmentation_lru derives from dirty_sectors, and wasn't being checked. Co-developed-by: Daniel Hill Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 24 +++++++++++------------- fs/bcachefs/sb-errors_format.h | 3 ++- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 0e477a926579..7c72a9e6f511 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -903,6 +903,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans, bch2_dev_usage_update(c, ca, &old_gc, &gc, 0, true); percpu_up_read(&c->mark_lock); + gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca); + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" @@ -916,23 +918,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans, #define copy_bucket_field(_errtype, _f) \ if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", \ + ": got %llu, should be %llu", \ iter->pos.inode, iter->pos.offset, \ gc.gen, \ bch2_data_type_str(gc.data_type), \ - new._f, gc._f)) \ + (u64) new._f, (u64) gc._f)) \ new._f = gc._f; \ - copy_bucket_field(alloc_key_gen_wrong, - gen); - copy_bucket_field(alloc_key_dirty_sectors_wrong, - dirty_sectors); - copy_bucket_field(alloc_key_cached_sectors_wrong, - cached_sectors); - copy_bucket_field(alloc_key_stripe_wrong, - stripe); - copy_bucket_field(alloc_key_stripe_redundancy_wrong, - stripe_redundancy); + copy_bucket_field(alloc_key_gen_wrong, gen); + copy_bucket_field(alloc_key_dirty_sectors_wrong, dirty_sectors); + copy_bucket_field(alloc_key_cached_sectors_wrong, cached_sectors); + copy_bucket_field(alloc_key_stripe_wrong, stripe); + copy_bucket_field(alloc_key_stripe_redundancy_wrong, stripe_redundancy); + copy_bucket_field(alloc_key_fragmentation_lru_wrong, fragmentation_lru); #undef copy_bucket_field if (!bch2_alloc_v4_cmp(*old, new)) @@ -946,7 +944,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, a->v = new; /* - * The trigger normally makes sure this is set, but we're not running + * The trigger normally makes sure these are set, but we're not running * triggers: */ if (a->v.data_type == 
BCH_DATA_cached && !a->v.io_time[READ]) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d6f35a99c429..d54121ec093f 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -286,7 +286,8 @@ enum bch_fsck_flags { x(accounting_mismatch, 272, 0) \ x(accounting_replicas_not_marked, 273, 0) \ x(invalid_btree_id, 274, 0) \ - x(alloc_key_io_time_bad, 275, 0) + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, -- cgit From a2d23f3d916bf9abd77944882cba131af1085bcc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 21:02:17 -0400 Subject: bcachefs: io clock: run timer fns under clock lock We don't have a way to flush a timer that's executing the callback, and this is simple and limited enough in scope that we can just use the lock instead. Needed for the next patch that adds direct wakeups from the allocator to copygc, where we're now more frequently calling io_timer_del() on an expiring timer. Signed-off-by: Kent Overstreet --- fs/bcachefs/clock.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 363644451106..0f40b585ce2b 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -132,14 +132,9 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, { struct io_timer *ret = NULL; - spin_lock(&clock->timer_lock); - if (clock->timers.used && time_after_eq(now, clock->timers.data[0]->expire)) heap_pop(&clock->timers, ret, io_timer_cmp, NULL); - - spin_unlock(&clock->timer_lock); - return ret; } @@ -148,8 +143,10 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; unsigned long now = atomic64_add_return(sectors, &clock->now); + spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); + spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -- cgit From 64d2c847ba380e07b9072d65a50aa6469d2aa43f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Thu, 20 Jun 2024 15:05:45 +0900 Subject: btrfs: zoned: fix calc_available_free_space() for zoned mode calc_available_free_space() returns the total size of metadata (or system) block groups, which can be allocated from unallocated disk space. The logic is wrong on zoned mode in two places. First, the calculation of data_chunk_size is wrong. We always allocate one zone as one chunk, with no partial allocation of a zone. So, we should use zone_size (= data_sinfo->chunk_size) as it is. Second, the result "avail" may not be zone aligned. Since we always allocate one zone as one chunk on zoned mode, returning non-zone size aligned bytes will result in less pressure on the async metadata reclaim process. This is serious in the nearly-full state on a device with a large zone size. Allowing too much over-commit will result in less async reclaim work and end up in ENOSPC. We can align down to the zone size to avoid that.
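[A small worked example of the align-down step; this uses a simplified modulo-based stand-in, whereas the kernel's ALIGN_DOWN() is a mask-based macro for power-of-two alignments.]

#include <stdio.h>

/* Simplified stand-in for the kernel's ALIGN_DOWN(). */
#define ALIGN_DOWN(x, a) ((x) - ((x) % (a)))

int main(void)
{
	unsigned long long zone_size = 256ULL << 20;	/* 256 MiB zones */
	unsigned long long avail = 1000ULL << 20;	/* 1000 MiB unallocated */

	/* Only three whole zones fit: 1000 MiB rounds down to 768 MiB.
	 * The 232 MiB remainder can never become a chunk, so reporting it
	 * as available would overstate how much can be over-committed. */
	printf("avail = %llu MiB\n", ALIGN_DOWN(avail, zone_size) >> 20);
	return 0;
}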
Fixes: cb6cbab79055 ("btrfs: adjust overcommit logic when very close to full") CC: stable@vger.kernel.org # 6.9 Signed-off-by: Naohiro Aota Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d620323d08ea..ae8c56442549 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -373,11 +373,18 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, * "optimal" chunk size based on the fs size. However when we actually * allocate the chunk we will strip this down further, making it no more * than 10% of the disk or 1G, whichever is smaller. + * + * On the zoned mode, we need to use zone_size (= + * data_sinfo->chunk_size) as it is. */ data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - data_chunk_size = min(data_sinfo->chunk_size, - mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); - data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + if (!btrfs_is_zoned(fs_info)) { + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + } else { + data_chunk_size = data_sinfo->chunk_size; + } /* * Since data allocations immediately use block groups as part of the @@ -405,6 +412,17 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, avail >>= 3; else avail >>= 1; + + /* + * On the zoned mode, we always allocate one zone as one chunk. + * Returning non-zone size alingned bytes here will result in + * less pressure for the async metadata reclaim process, and it + * will over-commit too much leading to ENOSPC. Align down to the + * zone size to avoid that. 
+ */ + if (btrfs_is_zoned(fs_info)) + avail = ALIGN_DOWN(avail, fs_info->zone_size); + return avail; } -- cgit From 724d8042cef84496ddb4492dc120291f997ae26b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 24 Jun 2024 15:10:53 +0930 Subject: btrfs: always do the basic checks for btrfs_qgroup_inherit structure [BUG] Syzbot reports the following regression detected by KASAN: BUG: KASAN: slab-out-of-bounds in btrfs_qgroup_inherit+0x42e/0x2e20 fs/btrfs/qgroup.c:3277 Read of size 8 at addr ffff88814628ca50 by task syz-executor318/5171 CPU: 0 PID: 5171 Comm: syz-executor318 Not tainted 6.10.0-rc2-syzkaller-00010-g2ab795141095 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/02/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 btrfs_qgroup_inherit+0x42e/0x2e20 fs/btrfs/qgroup.c:3277 create_pending_snapshot+0x1359/0x29b0 fs/btrfs/transaction.c:1854 create_pending_snapshots+0x195/0x1d0 fs/btrfs/transaction.c:1922 btrfs_commit_transaction+0xf20/0x3740 fs/btrfs/transaction.c:2382 create_snapshot+0x6a1/0x9e0 fs/btrfs/ioctl.c:875 btrfs_mksubvol+0x58f/0x710 fs/btrfs/ioctl.c:1029 btrfs_mksnapshot+0xb5/0xf0 fs/btrfs/ioctl.c:1075 __btrfs_ioctl_snap_create+0x387/0x4b0 fs/btrfs/ioctl.c:1340 btrfs_ioctl_snap_create_v2+0x1f2/0x3a0 fs/btrfs/ioctl.c:1422 btrfs_ioctl+0x99e/0xc60 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcbf1992509 RSP: 002b:00007fcbf1928218 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcbf1a1f618 RCX: 00007fcbf1992509 RDX: 0000000020000280 RSI: 0000000050009417 RDI: 0000000000000003 RBP: 00007fcbf1a1f610 R08: 00007ffea1298e97 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fcbf19eb660 R13: 00000000200002b8 R14: 00007fcbf19e60c0 R15: 0030656c69662f2e Syzbot also pinned it down to commit b5357cb268c4 ("btrfs: qgroup: do not check qgroup inherit if qgroup is disabled"). [CAUSE] That offending commit skips the whole qgroup inherit check if qgroup is not enabled. But that also skips the very basic checks like num_ref_copies/num_excl_copies and the structure size checks. This means that if a qgroup enable/disable race is happening in the background, and we pass a btrfs_qgroup_inherit structure when the qgroup is disabled, the check would be completely skipped. Then at the time of transaction commitment, qgroup is re-enabled and btrfs_qgroup_inherit() is going to use the incorrect structure, causing the above KASAN error. [FIX] Make btrfs_qgroup_check_inherit() only skip the source qgroup checks. So that even if an invalid btrfs_qgroup_inherit structure is passed in, we can still reject invalid ones no matter if qgroup is enabled or not. Furthermore, we already have an extra safety inside btrfs_qgroup_inherit(), which would just ignore invalid qgroup sources, so even if we only skip the qgroup source check we're still safe.
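The essence of the fix is an ordering rule: structural validation of untrusted input must not be gated on a feature bit that can flip concurrently. A rough userspace model of that shape (hypothetical names, not the btrfs API):

#include <stddef.h>
#include <stdbool.h>
#include <errno.h>

struct inherit_arg {
	unsigned long flags;
	unsigned int num_items;
	/* a flexible payload would follow in the real structure */
};

static int check_inherit(const struct inherit_arg *arg, size_t size,
			 bool feature_enabled)
{
	/* Structural checks run unconditionally... */
	if (size < sizeof(*arg))
		return -EINVAL;
	if (arg->flags & ~0x1UL)
		return -EOPNOTSUPP;

	/* ...and only the feature-dependent deep checks are skipped. */
	if (!feature_enabled)
		return 0;

	/* per-item validation of arg->num_items entries would go here */
	return 0;
}

Even if the feature races from disabled to enabled later, the consumer is at least guaranteed a structurally sane buffer.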
Reported-by: syzbot+a0d1f7e26910be4dc171@syzkaller.appspotmail.com Fixes: b5357cb268c4 ("btrfs: qgroup: do not check qgroup inherit if qgroup is disabled") Reviewed-by: Boris Burkov Reviewed-by: Jeongjun Park Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index bf0f81d59b6b..39a15cca58ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3062,8 +3062,6 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_inherit *inherit, size_t size) { - if (!btrfs_qgroup_enabled(fs_info)) - return 0; if (inherit->flags & ~BTRFS_QGROUP_INHERIT_FLAGS_SUPP) return -EOPNOTSUPP; if (size < sizeof(*inherit) || size > PAGE_SIZE) @@ -3084,6 +3082,14 @@ int btrfs_qgroup_check_inherit(struct btrfs_fs_info *fs_info, if (size != struct_size(inherit, qgroups, inherit->num_qgroups)) return -EINVAL; + /* + * Skip the inherit source qgroups check if qgroup is not enabled. + * Qgroup can still be later enabled causing problems, but in that case + * btrfs_qgroup_inherit() would just ignore those invalid ones. + */ + if (!btrfs_qgroup_enabled(fs_info)) + return 0; + /* * Now check all the remaining qgroups, they should all: * -- cgit From 9da45c88e124f13a3c4d480b89b298e007fbb9e4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 23 Jun 2024 12:50:26 +0100 Subject: btrfs: fix uninitialized return value in the ref-verify tool In the ref-verify tool, when processing the inline references of an extent item, we may end up returning with an uninitialized return value, because: 1) The 'ret' variable is not initialized if there are no inline extent references ('ptr' == 'end' before the while loop starts); 2) If we find an extent owner inline reference we don't initialize 'ret'. So fix these cases by initializing 'ret' to 0 when declaring the variable and setting it to -EINVAL if we find an extent owner inline reference and simple quotas are not enabled (as well as printing an error message).
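The underlying bug is the classic pattern of a status variable that only some paths assign. A minimal sketch of the corrected shape (hypothetical code, not the ref-verify source):

#include <errno.h>

static int process_items(const int *items, int n)
{
	int ret = 0;	/* the fix: initialize at declaration */

	for (int i = 0; i < n; i++) {
		switch (items[i]) {
		case 1:		/* valid item, nothing to record */
			break;
		case 2:		/* invalid item: assign, don't just warn */
			ret = -EINVAL;
			break;
		}
	}
	/* Well-defined even when the loop body never runs (n == 0). */
	return ret;
}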
Reported-by: Mirsad Todorovac Link: https://lore.kernel.org/linux-btrfs/59b40ebe-c824-457d-8b24-0bbca69d472b@gmail.com/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ref-verify.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index cf531255ab76..9522a8b79d22 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -441,7 +441,8 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, u32 item_size = btrfs_item_size(leaf, slot); unsigned long end, ptr; u64 offset, flags, count; - int type, ret; + int type; + int ret = 0; ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -486,7 +487,11 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, key->objectid, key->offset); break; case BTRFS_EXTENT_OWNER_REF_KEY: - WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) { + btrfs_err(fs_info, + "found extent owner ref without simple quotas enabled"); + ret = -EINVAL; + } break; default: btrfs_err(fs_info, "invalid key type in iref"); -- cgit From 08f70c0a930c00d25015fed7e3b7c5370d60be24 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 2 Jul 2024 15:50:09 +0100 Subject: cifs: Fix read-performance regression by dropping readahead expansion cifs_expand_readahead() is causing a performance regression of around 30% by causing extra pagecache to be allocated for an inode in the readahead path before we begin actually dispatching RPC requests, thereby delaying the actual I/O. The expansion is sized according to the rsize parameter, which seems to be 4MiB on my test system; this is a big step up from the first requests made by the fio test program. Simple repro (look at read bandwidth number): fio --name=writetest --filename=/xfstest.test/foo --time_based --runtime=60 --size=16M --numjobs=1 --rw=read Fix this by removing cifs_expand_readahead(). Readahead expansion is mostly useful when we're using a local cache that has a block size greater than PAGE_SIZE, so we can dispense with it when not caching. Fixes: 69c3c023af25 ("cifs: Implement netfslib hooks") Signed-off-by: David Howells Acked-by: Paulo Alcantara (Red Hat) cc: Jeff Layton cc: Matthew Wilcox cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org Signed-off-by: Steve French --- fs/smb/client/file.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index f1f2573bb18d..1374635e89fa 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -245,35 +245,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file) return 0; } -/* - * Expand the size of a readahead to the size of the rsize, if at least as - * large as a page, allowing for the possibility that rsize is not pow-2 - * aligned.
- */ -static void cifs_expand_readahead(struct netfs_io_request *rreq) -{ - unsigned int rsize = rreq->rsize; - loff_t misalignment, i_size = i_size_read(rreq->inode); - - if (rsize < PAGE_SIZE) - return; - - if (rsize < INT_MAX) - rsize = roundup_pow_of_two(rsize); - else - rsize = ((unsigned int)INT_MAX + 1) / 2; - - misalignment = rreq->start & (rsize - 1); - if (misalignment) { - rreq->start -= misalignment; - rreq->len += misalignment; - } - - rreq->len = round_up(rreq->len, rsize); - if (rreq->start < i_size && rreq->len > i_size - rreq->start) - rreq->len = i_size - rreq->start; -} - /* * Completion of a request operation. */ @@ -329,7 +300,6 @@ const struct netfs_request_ops cifs_req_ops = { .init_request = cifs_init_request, .free_request = cifs_free_request, .free_subrequest = cifs_free_subrequest, - .expand_readahead = cifs_expand_readahead, .clamp_length = cifs_clamp_length, .issue_read = cifs_req_issue_read, .done = cifs_rreq_done, -- cgit From 85b08b31a22b481ec6528130daf94eee4452e23f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:22 +0800 Subject: netfs, fscache: export fscache_put_volume() and add fscache_try_get_volume() Export fscache_put_volume() and add fscache_try_get_volume() helper function to allow cachefiles to get/put fscache_volume via linux/fscache-cache.h. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-2-libaokun@huaweicloud.com Signed-off-by: Christian Brauner --- fs/netfs/fscache_volume.c | 14 ++++++++++++++ fs/netfs/internal.h | 2 -- include/linux/fscache-cache.h | 6 ++++++ 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c index fbdc428aaea9..2e2a405ca9b0 100644 --- a/fs/netfs/fscache_volume.c +++ b/fs/netfs/fscache_volume.c @@ -27,6 +27,19 @@ struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, return volume; } +struct fscache_volume *fscache_try_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where) +{ + int ref; + + if (!__refcount_inc_not_zero(&volume->ref, &ref)) + return NULL; + + trace_fscache_volume(volume->debug_id, ref + 1, where); + return volume; +} +EXPORT_SYMBOL(fscache_try_get_volume); + static void fscache_see_volume(struct fscache_volume *volume, enum fscache_volume_trace where) { @@ -420,6 +433,7 @@ void fscache_put_volume(struct fscache_volume *volume, fscache_free_volume(volume); } } +EXPORT_SYMBOL(fscache_put_volume); /* * Relinquish a volume representation cookie. 
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index de59e39e39a7..a92bfc8e8c68 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -352,8 +352,6 @@ extern const struct seq_operations fscache_volumes_seq_ops; struct fscache_volume *fscache_get_volume(struct fscache_volume *volume, enum fscache_volume_trace where); -void fscache_put_volume(struct fscache_volume *volume, - enum fscache_volume_trace where); bool fscache_begin_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index bdf7f3eddf0a..4c91a019972b 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -19,6 +19,7 @@ enum fscache_cache_trace; enum fscache_cookie_trace; enum fscache_access_trace; +enum fscache_volume_trace; enum fscache_cache_state { FSCACHE_CACHE_IS_NOT_PRESENT, /* No cache is present for this name */ @@ -97,6 +98,11 @@ extern void fscache_withdraw_cookie(struct fscache_cookie *cookie); extern void fscache_io_error(struct fscache_cache *cache); +extern struct fscache_volume * +fscache_try_get_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); +extern void fscache_put_volume(struct fscache_volume *volume, + enum fscache_volume_trace where); extern void fscache_end_volume_access(struct fscache_volume *volume, struct fscache_cookie *cookie, enum fscache_access_trace why); -- cgit From 522018a0de6b6fcce60c04f86dfc5f0e4b6a1b36 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:23 +0800 Subject: cachefiles: fix slab-use-after-free in fscache_withdraw_volume() We got the following issue in our fault injection stress test: ================================================================== BUG: KASAN: slab-use-after-free in fscache_withdraw_volume+0x2e1/0x370 Read of size 4 at addr ffff88810680be08 by task ondemand-04-dae/5798 CPU: 0 PID: 5798 Comm: ondemand-04-dae Not tainted 6.8.0-dirty #565 Call Trace: kasan_check_range+0xf6/0x1b0 fscache_withdraw_volume+0x2e1/0x370 cachefiles_withdraw_volume+0x31/0x50 cachefiles_withdraw_cache+0x3ad/0x900 cachefiles_put_unbind_pincount+0x1f6/0x250 cachefiles_daemon_release+0x13b/0x290 __fput+0x204/0xa00 task_work_run+0x139/0x230 Allocated by task 5820: __kmalloc+0x1df/0x4b0 fscache_alloc_volume+0x70/0x600 __fscache_acquire_volume+0x1c/0x610 erofs_fscache_register_volume+0x96/0x1a0 erofs_fscache_register_fs+0x49a/0x690 erofs_fc_fill_super+0x6c0/0xcc0 vfs_get_super+0xa9/0x140 vfs_get_tree+0x8e/0x300 do_new_mount+0x28c/0x580 [...] Freed by task 5820: kfree+0xf1/0x2c0 fscache_put_volume.part.0+0x5cb/0x9e0 erofs_fscache_unregister_fs+0x157/0x1b0 erofs_kill_sb+0xd9/0x1c0 deactivate_locked_super+0xa3/0x100 vfs_get_super+0x105/0x140 vfs_get_tree+0x8e/0x300 do_new_mount+0x28c/0x580 [...] 
================================================================== Following is the process that triggers the issue: mount failed | daemon exit ------------------------------------------------------------ deactivate_locked_super cachefiles_daemon_release erofs_kill_sb erofs_fscache_unregister_fs fscache_relinquish_volume __fscache_relinquish_volume fscache_put_volume(fscache_volume, fscache_volume_put_relinquish) zero = __refcount_dec_and_test(&fscache_volume->ref, &ref); cachefiles_put_unbind_pincount cachefiles_daemon_unbind cachefiles_withdraw_cache cachefiles_withdraw_volumes list_del_init(&volume->cache_link) fscache_free_volume(fscache_volume) cache->ops->free_volume cachefiles_free_volume list_del_init(&cachefiles_volume->cache_link); kfree(fscache_volume) cachefiles_withdraw_volume fscache_withdraw_volume fscache_volume->n_accesses // fscache_volume UAF !!! The fscache_volume in cache->volumes must not have been freed yet, but its reference count may be 0. So use the new fscache_try_get_volume() helper function to try to get a reference on it. If the reference count of fscache_volume is 0, fscache_put_volume() is freeing it, so wait for it to be removed from cache->volumes. If its reference count is not 0, call cachefiles_withdraw_volume() with reference count protection to avoid the above issue. Fixes: fe2140e2f57f ("cachefiles: Implement volume support") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-3-libaokun@huaweicloud.com Signed-off-by: Christian Brauner --- fs/cachefiles/cache.c | 10 ++++++++++ include/trace/events/fscache.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index f449f7340aad..56ef519a36a0 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "internal.h" /* @@ -319,12 +320,20 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) _enter(""); for (;;) { + struct fscache_volume *vcookie = NULL; struct cachefiles_volume *volume = NULL; spin_lock(&cache->object_list_lock); if (!list_empty(&cache->volumes)) { volume = list_first_entry(&cache->volumes, struct cachefiles_volume, cache_link); + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (!vcookie) { + spin_unlock(&cache->object_list_lock); + cpu_relax(); + continue; + } list_del_init(&volume->cache_link); } spin_unlock(&cache->object_list_lock); @@ -332,6 +341,7 @@ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) break; cachefiles_withdraw_volume(volume); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); } _leave(""); diff --git a/include/trace/events/fscache.h b/include/trace/events/fscache.h index a6190aa1b406..f1a73aa83fbb 100644 --- a/include/trace/events/fscache.h +++ b/include/trace/events/fscache.h @@ -35,12 +35,14 @@ enum fscache_volume_trace { fscache_volume_get_cookie, fscache_volume_get_create_work, fscache_volume_get_hash_collision, + fscache_volume_get_withdraw, fscache_volume_free, fscache_volume_new_acquire, fscache_volume_put_cookie, fscache_volume_put_create_work, fscache_volume_put_hash_collision, fscache_volume_put_relinquish, + fscache_volume_put_withdraw, fscache_volume_see_create_work, fscache_volume_see_hash_wake, fscache_volume_wait_create_work, @@ -120,12 +122,14 @@ enum fscache_access_trace { EM(fscache_volume_get_cookie, "GET cook ") \ EM(fscache_volume_get_create_work, "GET creat") \ EM(fscache_volume_get_hash_collision,
"GET hcoll") \ + EM(fscache_volume_get_withdraw, "GET withd") \ EM(fscache_volume_free, "FREE ") \ EM(fscache_volume_new_acquire, "NEW acq ") \ EM(fscache_volume_put_cookie, "PUT cook ") \ EM(fscache_volume_put_create_work, "PUT creat") \ EM(fscache_volume_put_hash_collision, "PUT hcoll") \ EM(fscache_volume_put_relinquish, "PUT relnq") \ + EM(fscache_volume_put_withdraw, "PUT withd") \ EM(fscache_volume_see_create_work, "SEE creat") \ EM(fscache_volume_see_hash_wake, "SEE hwake") \ E_(fscache_volume_wait_create_work, "WAIT crea") -- cgit From 5d8f805789072ea7fd39504694b7bd17e5f751c4 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:24 +0800 Subject: cachefiles: fix slab-use-after-free in cachefiles_withdraw_cookie() We got the following issue in our fault injection stress test: ================================================================== BUG: KASAN: slab-use-after-free in cachefiles_withdraw_cookie+0x4d9/0x600 Read of size 8 at addr ffff888118efc000 by task kworker/u78:0/109 CPU: 13 PID: 109 Comm: kworker/u78:0 Not tainted 6.8.0-dirty #566 Call Trace: kasan_report+0x93/0xc0 cachefiles_withdraw_cookie+0x4d9/0x600 fscache_cookie_state_machine+0x5c8/0x1230 fscache_cookie_worker+0x91/0x1c0 process_one_work+0x7fa/0x1800 [...] Allocated by task 117: kmalloc_trace+0x1b3/0x3c0 cachefiles_acquire_volume+0xf3/0x9c0 fscache_create_volume_work+0x97/0x150 process_one_work+0x7fa/0x1800 [...] Freed by task 120301: kfree+0xf1/0x2c0 cachefiles_withdraw_cache+0x3fa/0x920 cachefiles_put_unbind_pincount+0x1f6/0x250 cachefiles_daemon_release+0x13b/0x290 __fput+0x204/0xa00 task_work_run+0x139/0x230 do_exit+0x87a/0x29b0 [...] ================================================================== Following is the process that triggers the issue: p1 | p2 ------------------------------------------------------------ fscache_begin_lookup fscache_begin_volume_access fscache_cache_is_live(fscache_cache) cachefiles_daemon_release cachefiles_put_unbind_pincount cachefiles_daemon_unbind cachefiles_withdraw_cache fscache_withdraw_cache fscache_set_cache_state(cache, FSCACHE_CACHE_IS_WITHDRAWN); cachefiles_withdraw_objects(cache) fscache_wait_for_objects(fscache) atomic_read(&fscache_cache->object_count) == 0 fscache_perform_lookup cachefiles_lookup_cookie cachefiles_alloc_object refcount_set(&object->ref, 1); object->volume = volume fscache_count_object(vcookie->cache); atomic_inc(&fscache_cache->object_count) cachefiles_withdraw_volumes cachefiles_withdraw_volume fscache_withdraw_volume __cachefiles_free_volume kfree(cachefiles_volume) fscache_cookie_state_machine cachefiles_withdraw_cookie cache = object->volume->cache; // cachefiles_volume UAF !!! After setting FSCACHE_CACHE_IS_WITHDRAWN, wait for all the cookie lookups to complete first, and then wait for fscache_cache->object_count == 0 to avoid the cookie exiting after the volume has been freed and triggering the above issue. Therefore call fscache_withdraw_volume() before calling cachefiles_withdraw_objects(). This way, after setting FSCACHE_CACHE_IS_WITHDRAWN, only the following two cases will occur: 1) fscache_begin_lookup fails in fscache_begin_volume_access(). 2) fscache_withdraw_volume() will ensure that fscache_count_object() has been executed before calling fscache_wait_for_objects(). 
Fixes: fe2140e2f57f ("cachefiles: Implement volume support") Suggested-by: Hou Tao Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-4-libaokun@huaweicloud.com Signed-off-by: Christian Brauner --- fs/cachefiles/cache.c | 35 ++++++++++++++++++++++++++++++++++- fs/cachefiles/volume.c | 1 - 2 files changed, 34 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c index 56ef519a36a0..9fb06dc16520 100644 --- a/fs/cachefiles/cache.c +++ b/fs/cachefiles/cache.c @@ -313,7 +313,39 @@ static void cachefiles_withdraw_objects(struct cachefiles_cache *cache) } /* - * Withdraw volumes. + * Withdraw fscache volumes. + */ +static void cachefiles_withdraw_fscache_volumes(struct cachefiles_cache *cache) +{ + struct list_head *cur; + struct cachefiles_volume *volume; + struct fscache_volume *vcookie; + + _enter(""); +retry: + spin_lock(&cache->object_list_lock); + list_for_each(cur, &cache->volumes) { + volume = list_entry(cur, struct cachefiles_volume, cache_link); + + if (atomic_read(&volume->vcookie->n_accesses) == 0) + continue; + + vcookie = fscache_try_get_volume(volume->vcookie, + fscache_volume_get_withdraw); + if (vcookie) { + spin_unlock(&cache->object_list_lock); + fscache_withdraw_volume(vcookie); + fscache_put_volume(vcookie, fscache_volume_put_withdraw); + goto retry; + } + } + spin_unlock(&cache->object_list_lock); + + _leave(""); +} + +/* + * Withdraw cachefiles volumes. */ static void cachefiles_withdraw_volumes(struct cachefiles_cache *cache) { @@ -381,6 +413,7 @@ void cachefiles_withdraw_cache(struct cachefiles_cache *cache) pr_info("File cache on %s unregistering\n", fscache->name); fscache_withdraw_cache(fscache); + cachefiles_withdraw_fscache_volumes(cache); /* we now have to destroy all the active objects pertaining to this * cache - which we do by passing them off to thread pool to be diff --git a/fs/cachefiles/volume.c b/fs/cachefiles/volume.c index 89df0ba8ba5e..781aac4ef274 100644 --- a/fs/cachefiles/volume.c +++ b/fs/cachefiles/volume.c @@ -133,7 +133,6 @@ void cachefiles_free_volume(struct fscache_volume *vcookie) void cachefiles_withdraw_volume(struct cachefiles_volume *volume) { - fscache_withdraw_volume(volume->vcookie); cachefiles_set_volume_xattr(volume); __cachefiles_free_volume(volume); } -- cgit From 0ece614a52bc9d219b839a6a29282b30d10e0c48 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:25 +0800 Subject: cachefiles: propagate errors from vfs_getxattr() to avoid infinite loop In cachefiles_check_volume_xattr(), the error returned by vfs_getxattr() is not passed to ret, so it ends up returning -ESTALE, which leads to an endless loop as follows: cachefiles_acquire_volume retry: ret = cachefiles_check_volume_xattr ret = -ESTALE xlen = vfs_getxattr // return -EIO // The ret is not updated when xlen < 0, so -ESTALE is returned. return ret // Supposed to jump out of the loop at this judgement. if (ret != -ESTALE) goto error_dir; cachefiles_bury_object // EIO causes rename failure goto retry; Hence propagate the error returned by vfs_getxattr() to avoid the above issue. Do the same in cachefiles_check_auxdata(). 
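The shape of the bug is easy to show in miniature: the helper's negative return value is tested but never copied into the status variable, so the caller retries on the wrong error code forever. A hypothetical sketch of the fixed flow:

#include <errno.h>

static int fake_getxattr(char *buf, int len)
{
	(void)buf; (void)len;
	return -EIO;		/* simulate the injected I/O error */
}

static int check_xattr(void)
{
	char buf[16];
	int ret = -ESTALE;	/* default: "stale, retry the object" */
	int xlen = fake_getxattr(buf, sizeof(buf));

	if (xlen != sizeof(buf)) {
		if (xlen < 0)
			ret = xlen;	/* the fix: propagate -EIO so the
					 * caller's retry loop can stop */
	}
	return ret;
}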
Fixes: 32e150037dce ("fscache, cachefiles: Store the volume coherency data") Fixes: 72b957856b0c ("cachefiles: Implement metadata/coherency data storage in xattrs") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-5-libaokun@huaweicloud.com Reviewed-by: Gao Xiang Signed-off-by: Christian Brauner --- fs/cachefiles/xattr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index bcb6173943ee..4dd8a993c60a 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -110,9 +110,11 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file if (xlen == 0) xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, tlen); if (xlen != tlen) { - if (xlen < 0) + if (xlen < 0) { + ret = xlen; trace_cachefiles_vfs_error(object, file_inode(file), xlen, cachefiles_trace_getxattr_error); + } if (xlen == -EIO) cachefiles_io_error_obj( object, @@ -252,6 +254,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume) xlen = vfs_getxattr(&nop_mnt_idmap, dentry, cachefiles_xattr_cache, buf, len); if (xlen != len) { if (xlen < 0) { + ret = xlen; trace_cachefiles_vfs_error(NULL, d_inode(dentry), xlen, cachefiles_trace_getxattr_error); if (xlen == -EIO) -- cgit From b2415d1f4566b6939acacc69637eaa57815829c1 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:26 +0800 Subject: cachefiles: stop sending new request when dropping object Add CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, which indicates that the cachefiles object is being dropped. It is set after the close request for the dropped object completes, and no new requests are allowed to be sent after entering this state. This prepares for the later addition of cancel_work_sync(). It prevents leftover reopen requests from being sent, to avoid processing unnecessary requests and to avoid cancel_work_sync() blocking by waiting for the daemon to complete the reopen requests. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-6-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Gao Xiang Reviewed-by: Jia Zhu Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 2 ++ fs/cachefiles/ondemand.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index d33169f0018b..8ecd296cc1c4 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -48,6 +48,7 @@ enum cachefiles_object_state { CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */ CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */ + CACHEFILES_ONDEMAND_OBJSTATE_DROPPING, /* Object is being dropped.
*/ }; struct cachefiles_ondemand_info { @@ -332,6 +333,7 @@ cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \ CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN); CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE); CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING); +CACHEFILES_OBJECT_STATE_FUNCS(dropping, DROPPING); static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req) { diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 4ba42f1fa3b4..73da4d4eaa9b 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -422,7 +422,8 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, */ xas_lock(&xas); - if (test_bit(CACHEFILES_DEAD, &cache->flags)) { + if (test_bit(CACHEFILES_DEAD, &cache->flags) || + cachefiles_ondemand_object_is_dropping(object)) { xas_unlock(&xas); ret = -EIO; goto out; @@ -463,7 +464,8 @@ out: * If error occurs after creating the anonymous fd, * cachefiles_ondemand_fd_release() will set object to close. */ - if (opcode == CACHEFILES_OP_OPEN) + if (opcode == CACHEFILES_OP_OPEN && + !cachefiles_ondemand_object_is_dropping(object)) cachefiles_ondemand_set_object_close(object); kfree(req); return ret; @@ -562,8 +564,12 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) void cachefiles_ondemand_clean_object(struct cachefiles_object *object) { + if (!object->ondemand) + return; + cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, cachefiles_ondemand_init_close_req, NULL); + cachefiles_ondemand_set_object_dropping(object); } int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, -- cgit From 751f524635a4f076117d714705eeddadaf6748ee Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:27 +0800 Subject: cachefiles: cancel all requests for the object that is being dropped After an object is dropped, requests for that object are useless, so cancel them to avoid causing other problems. This prepares for the later addition of cancel_work_sync(). After a reopen request is generated, cancel it to avoid cancel_work_sync() blocking by waiting for the daemon to complete the reopen request. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-7-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Gao Xiang Reviewed-by: Jia Zhu Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 73da4d4eaa9b..be1d327a2219 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -564,12 +564,31 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object) void cachefiles_ondemand_clean_object(struct cachefiles_object *object) { + unsigned long index; + struct cachefiles_req *req; + struct cachefiles_cache *cache; + if (!object->ondemand) return; cachefiles_ondemand_send_req(object, CACHEFILES_OP_CLOSE, 0, cachefiles_ondemand_init_close_req, NULL); + + if (!object->ondemand->ondemand_id) + return; + + /* Cancel all requests for the object that is being dropped.
*/ cache = object->volume->cache; + xa_lock(&cache->reqs); cachefiles_ondemand_set_object_dropping(object); + xa_for_each(&cache->reqs, index, req) { + if (req->object == object) { + req->error = -EIO; + complete(&req->done); + __xa_erase(&cache->reqs, index); + } + } + xa_unlock(&cache->reqs); } int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, -- cgit From 12e009d60852f7bce0afc373ca0b320f14150418 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 28 Jun 2024 14:29:28 +0800 Subject: cachefiles: wait for ondemand_object_worker to finish when dropping object When queuing ondemand_object_worker() to re-open the object, cachefiles_object is not pinned. The cachefiles_object may be freed when the pending read request is completed intentionally and the related erofs is umounted. If ondemand_object_worker() runs after the object is freed, it will incur a use-after-free problem as shown below. process A process B process C process D cachefiles_ondemand_send_req() // send a read req X // wait for its completion // close ondemand fd cachefiles_ondemand_fd_release() // set object as CLOSE cachefiles_ondemand_daemon_read() // set object as REOPENING queue_work(fscache_wq, &info->ondemand_work) // close /dev/cachefiles cachefiles_daemon_release cachefiles_flush_reqs complete(&req->done) // read req X is completed // umount the erofs fs cachefiles_put_object() // object will be freed cachefiles_ondemand_deinit_obj_info() kmem_cache_free(object) // both info and object are freed ondemand_object_worker() When dropping an object, it is no longer necessary to reopen the object, so use cancel_work_sync() to cancel or wait for ondemand_object_worker() to finish. Fixes: 0a7e54c1959c ("cachefiles: resend an open request if the read request's object is closed") Signed-off-by: Hou Tao Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-8-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Gao Xiang Signed-off-by: Christian Brauner --- fs/cachefiles/ondemand.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index be1d327a2219..6166b66729e4 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -589,6 +589,9 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object) } } xa_unlock(&cache->reqs); + + /* Wait for ondemand_object_worker() to finish to avoid UAF.
*/ cancel_work_sync(&object->ondemand->ondemand_work); } int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object, -- cgit From 19f4f399091478c95947f6bd7ad61622300c30d9 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 28 Jun 2024 14:29:29 +0800 Subject: cachefiles: cyclic allocation of msg_id to avoid reuse Reusing the msg_id after a maliciously completed reopen request may cause a read request to remain unprocessed and result in a hang, as shown below: t1 | t2 | t3 ------------------------------------------------- cachefiles_ondemand_select_req cachefiles_ondemand_object_is_close(A) cachefiles_ondemand_set_object_reopening(A) queue_work(fscache_object_wq, &info->work) ondemand_object_worker cachefiles_ondemand_init_object(A) cachefiles_ondemand_send_req(OPEN) // get msg_id 6 wait_for_completion(&req_A->done) cachefiles_ondemand_daemon_read // read msg_id 6 req_A cachefiles_ondemand_get_fd copy_to_user // Malicious completion msg_id 6 copen 6,-1 cachefiles_ondemand_copen complete(&req_A->done) // will not set the object to close // because ondemand_id && fd is valid. // ondemand_object_worker() is done // but the object is still reopening. // new open req_B cachefiles_ondemand_init_object(B) cachefiles_ondemand_send_req(OPEN) // reuse msg_id 6 process_open_req copen 6,A.size // The expected failed copen was executed successfully We expect copen to fail; when it does, it closes the fd, which sets the object to close, and the close then triggers a reopen again. However, due to msg_id reuse resulting in a successful copen, the anonymous fd is not closed until the daemon exits. Therefore read requests waiting for the reopen to complete may trigger a hung task. To avoid this issue, allocate msg_ids cyclically so that a msg_id released a very short time ago is not reused. Fixes: c8383054506c ("cachefiles: notify the user daemon when looking up cookie") Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-9-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Gao Xiang Reviewed-by: Jia Zhu Signed-off-by: Christian Brauner --- fs/cachefiles/internal.h | 1 + fs/cachefiles/ondemand.c | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8ecd296cc1c4..9200c00f3e98 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -128,6 +128,7 @@ struct cachefiles_cache { unsigned long req_id_next; struct xarray ondemand_ids; /* xarray for ondemand_id allocation */ u32 ondemand_id_next; + u32 msg_id_next; }; static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index 6166b66729e4..d40b17010b5f 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -433,20 +433,32 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object, smp_mb(); if (opcode == CACHEFILES_OP_CLOSE && - !cachefiles_ondemand_object_is_open(object)) { + !cachefiles_ondemand_object_is_open(object)) { WARN_ON_ONCE(object->ondemand->ondemand_id == 0); xas_unlock(&xas); ret = -EIO; goto out; } - xas.xa_index = 0; + /* + * Cyclically find a free xas to avoid msg_id reuse that would + * cause the daemon to successfully copen a stale msg_id.
+ */ + xas.xa_index = cache->msg_id_next; xas_find_marked(&xas, UINT_MAX, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) { + xas.xa_index = 0; + xas_find_marked(&xas, cache->msg_id_next - 1, XA_FREE_MARK); + } if (xas.xa_node == XAS_RESTART) xas_set_err(&xas, -EBUSY); + xas_store(&xas, req); - xas_clear_mark(&xas, XA_FREE_MARK); - xas_set_mark(&xas, CACHEFILES_REQ_NEW); + if (xas_valid(&xas)) { + cache->msg_id_next = xas.xa_index + 1; + xas_clear_mark(&xas, XA_FREE_MARK); + xas_set_mark(&xas, CACHEFILES_REQ_NEW); + } xas_unlock(&xas); } while (xas_nomem(&xas, GFP_KERNEL)); -- cgit From cf5bb09e742a9cf6349127e868329a8f69b7a014 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 28 Jun 2024 14:29:30 +0800 Subject: cachefiles: add missing lock protection when polling Add missing lock protection in poll routine when iterating xarray, otherwise: Even with RCU read lock held, only the slot of the radix tree is ensured to be pinned there, while the data structure (e.g. struct cachefiles_req) stored in the slot has no such guarantee. The poll routine will iterate the radix tree and dereference cachefiles_req accordingly. Thus RCU read lock is not adequate in this case and spinlock is needed here. Fixes: b817e22b2e91 ("cachefiles: narrow the scope of triggering EPOLLIN events in ondemand mode") Signed-off-by: Jingbo Xu Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240628062930.2467993-10-libaokun@huaweicloud.com Acked-by: Jeff Layton Reviewed-by: Jia Zhu Reviewed-by: Gao Xiang Signed-off-by: Christian Brauner --- fs/cachefiles/daemon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index 6465e2574230..73ed2323282a 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -365,14 +365,14 @@ static __poll_t cachefiles_daemon_poll(struct file *file, if (cachefiles_in_ondemand_mode(cache)) { if (!xa_empty(&cache->reqs)) { - rcu_read_lock(); + xas_lock(&xas); xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) { if (!cachefiles_ondemand_is_reopening_read(req)) { mask |= EPOLLIN; break; } } - rcu_read_unlock(); + xas_unlock(&xas); } } else { if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags)) -- cgit From da0386c1c70da1a01b5fa8ec503b96116bc8734c Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 2 Jul 2024 07:31:13 -0700 Subject: btrfs: fix folio refcount in btrfs_do_encoded_write() The conversion to folios switched __free_page() to __folio_put() in the error path in btrfs_do_encoded_write(). However, this gets the page refcounting wrong. If we do hit that error path (I reproduced by modifying btrfs_do_encoded_write to pretend to always fail in a way that jumps to out_folios and running the fstests case btrfs/281), then we always hit the following BUG freeing the folio: BUG: Bad page state in process btrfs pfn:40ab0b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x61be5 pfn:0x40ab0b flags: 0x5ffff0000000000(node=0|zone=2|lastcpupid=0x1ffff) raw: 05ffff0000000000 0000000000000000 dead000000000122 0000000000000000 raw: 0000000000061be5 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount Call Trace: dump_stack_lvl+0x3d/0xe0 bad_page+0xea/0xf0 free_unref_page+0x8e1/0x900 ? __mem_cgroup_uncharge+0x69/0x90 __folio_put+0xe6/0x190 btrfs_do_encoded_write+0x445/0x780 ? 
current_time+0x25/0xd0 btrfs_do_write_iter+0x2cc/0x4b0 btrfs_ioctl_encoded_write+0x2b6/0x340 It turns out __free_page() decreases the page reference count while __folio_put() does not. Switch __folio_put() to folio_put() which decreases the folio reference count first. Fixes: 400b172b8cdc ("btrfs: compression: migrate compression/decompression paths to folios") Tested-by: Ed Tomlinson Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d0274324c75a..6c96a6086d3f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10382,7 +10382,7 @@ out_unlock: out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) - __folio_put(folios[i]); + folio_put(folios[i]); } kvfree(folios); out: -- cgit From a56c85fa2d59ab0780514741550edf87989a66e9 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Tue, 2 Jul 2024 07:31:14 -0700 Subject: btrfs: fix folio refcount in __alloc_dummy_extent_buffer() This is another improper use of __folio_put() in an error path after freshly allocating pages/folios, which returns them with the refcount initialized to 1. The refactor from __free_pages() -> __folio_put() (instead of folio_put) removed a refcount decrement found in __free_pages() and folio_put but absent from __folio_put(). Fixes: 13df3775efca ("btrfs: cleanup metadata page pointer usage") CC: stable@vger.kernel.org # 6.8+ Tested-by: Ed Tomlinson Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f688fab55251..958155cc43a8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3553,7 +3553,7 @@ err: for (int i = 0; i < num_folios; i++) { if (eb->folios[i]) { detach_extent_buffer_folio(eb, eb->folios[i]); - __folio_put(eb->folios[i]); + folio_put(eb->folios[i]); } } __free_extent_buffer(eb); -- cgit From 25a6e135569b3901452e4863c94560df7c11c492 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 24 Jun 2024 08:39:23 +0900 Subject: ksmbd: return FILE_DEVICE_DISK instead of super magic The MS-SMB2 specification describes setting ->DeviceType to FILE_DEVICE_DISK or FILE_DEVICE_CD_ROM. Set FILE_DEVICE_DISK instead of super magic in FS_DEVICE_INFORMATION. Also set FILE_READ_ONLY_DEVICE for read-only shares.
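Both fields are plain little-endian bitmasks. A userspace sketch of how they are composed (the constants match the hunk below; the struct and helper are stand-ins, and the cpu_to_le32() conversion is omitted):

#include <stdint.h>
#include <stdbool.h>

#define FILE_DEVICE_DISK	0x00000007
#define FILE_READ_ONLY_DEVICE	0x00000002
#define FILE_DEVICE_IS_MOUNTED	0x00000020

struct fs_device_info {
	uint32_t device_type;		/* __le32 on the wire */
	uint32_t characteristics;	/* __le32 on the wire */
};

/* Always report a mounted disk; add the read-only bit when the
 * share is not writable. */
static void fill_device_info(struct fs_device_info *info, bool writable)
{
	info->device_type = FILE_DEVICE_DISK;
	info->characteristics = FILE_DEVICE_IS_MOUNTED;
	if (!writable)
		info->characteristics |= FILE_READ_ONLY_DEVICE;
}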
Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/common/smb2pdu.h | 34 ++++++++++++++++++++++++++++++++++ fs/smb/server/smb2pdu.c | 9 +++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index 8d10be1fe18a..c3ee42188d25 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -917,6 +917,40 @@ struct smb2_query_directory_rsp { __u8 Buffer[]; } __packed; +/* DeviceType Flags */ +#define FILE_DEVICE_CD_ROM 0x00000002 +#define FILE_DEVICE_CD_ROM_FILE_SYSTEM 0x00000003 +#define FILE_DEVICE_DFS 0x00000006 +#define FILE_DEVICE_DISK 0x00000007 +#define FILE_DEVICE_DISK_FILE_SYSTEM 0x00000008 +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#define FILE_DEVICE_NAMED_PIPE 0x00000011 +#define FILE_DEVICE_NETWORK 0x00000012 +#define FILE_DEVICE_NETWORK_FILE_SYSTEM 0x00000014 +#define FILE_DEVICE_NULL 0x00000015 +#define FILE_DEVICE_PARALLEL_PORT 0x00000016 +#define FILE_DEVICE_PRINTER 0x00000018 +#define FILE_DEVICE_SERIAL_PORT 0x0000001b +#define FILE_DEVICE_STREAMS 0x0000001e +#define FILE_DEVICE_TAPE 0x0000001f +#define FILE_DEVICE_TAPE_FILE_SYSTEM 0x00000020 +#define FILE_DEVICE_VIRTUAL_DISK 0x00000024 +#define FILE_DEVICE_NETWORK_REDIRECTOR 0x00000028 + +/* Device Characteristics */ +#define FILE_REMOVABLE_MEDIA 0x00000001 +#define FILE_READ_ONLY_DEVICE 0x00000002 +#define FILE_FLOPPY_DISKETTE 0x00000004 +#define FILE_WRITE_ONCE_MEDIA 0x00000008 +#define FILE_REMOTE_DEVICE 0x00000010 +#define FILE_DEVICE_IS_MOUNTED 0x00000020 +#define FILE_VIRTUAL_VOLUME 0x00000040 +#define FILE_DEVICE_SECURE_OPEN 0x00000100 +#define FILE_CHARACTERISTIC_TS_DEVICE 0x00001000 +#define FILE_CHARACTERISTIC_WEBDAV_DEVICE 0x00002000 +#define FILE_PORTABLE_DEVICE 0x00004000 +#define FILE_DEVICE_ALLOW_APPCONTAINER_TRAVERSAL 0x00020000 + /* * Maximum number of iovs we need for a set-info request. * The largest one is rename/hardlink diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index e7e07891781b..786cd45fe18f 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -5314,8 +5314,13 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, info = (struct filesystem_device_info *)rsp->Buffer; - info->DeviceType = cpu_to_le32(stfs.f_type); - info->DeviceCharacteristics = cpu_to_le32(0x00000020); + info->DeviceType = cpu_to_le32(FILE_DEVICE_DISK); + info->DeviceCharacteristics = + cpu_to_le32(FILE_DEVICE_IS_MOUNTED); + if (!test_tree_conn_flag(work->tcon, + KSMBD_TREE_CONN_FLAG_WRITABLE)) + info->DeviceCharacteristics |= + cpu_to_le32(FILE_READ_ONLY_DEVICE); rsp->OutputBufferLength = cpu_to_le32(8); break; } -- cgit From 1723f04caacb32cadc4e063725d836a0c4450694 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Wed, 26 Jun 2024 09:05:11 -0400 Subject: Fix userfaultfd_api to return EINVAL as expected Currently if we request a feature that is not set in the Kernel config we fail silently and return all the available features. However, the man page indicates we should return an EINVAL. We need to fix this issue since we can end up with a Kernel warning should a program request the feature UFFD_FEATURE_WP_UNPOPULATED on a kernel with the config not set with this feature. [ 200.812896] WARNING: CPU: 91 PID: 13634 at mm/memory.c:1660 zap_pte_range+0x43d/0x660 [ 200.820738] Modules linked in: [ 200.869387] CPU: 91 PID: 13634 Comm: userfaultfd Kdump: loaded Not tainted 6.9.0-rc5+ #8 [ 200.877477] Hardware name: Dell Inc. 
PowerEdge R6525/0N7YGH, BIOS 2.7.3 03/30/2022 [ 200.885052] RIP: 0010:zap_pte_range+0x43d/0x660 Link: https://lkml.kernel.org/r/20240626130513.120193-1-audra@redhat.com Fixes: e06f1e1dd499 ("userfaultfd: wp: enabled write protection in userfaultfd API") Signed-off-by: Audra Mitchell Cc: Al Viro Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Jan Kara Cc: Mike Rapoport Cc: Peter Xu Cc: Rafael Aquini Cc: Shaohua Li Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index eee7320ab0b0..17e409ceaa33 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -2057,7 +2057,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto out; features = uffdio_api.features; ret = -EINVAL; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + if (uffdio_api.api != UFFD_API) goto err_out; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) @@ -2081,6 +2081,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif + + ret = -EINVAL; + if (features & ~uffdio_api.features) + goto err_out; + uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) -- cgit From a9e1ddc09ca55746079cc479aa3eb6411f0d99d4 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 29 Jun 2024 01:51:07 +0900 Subject: nilfs2: fix kernel bug on rename operation of broken directory Syzbot reported that, in a rename operation on a broken directory on nilfs2, __block_write_begin_int(), which is called to prepare a block write, may fail the BUG_ON check for access exceeding the folio/page size. This is because nilfs_dotdot(), which gets the parent directory reference entry ("..") of the directory to be moved or renamed, does not check consistency enough, and may return a location exceeding the folio/page size for broken directories. Fix this issue by checking the required directory entries ("." and "..") in the first chunk of the directory in nilfs_dotdot(). Link: https://lkml.kernel.org/r/20240628165107.9006-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Reported-by: syzbot+d3abed1ad3d367fa2627@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d3abed1ad3d367fa2627 Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/dir.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index dddfa604491a..4a29b0138d75 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -383,11 +383,39 @@ found: struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop) { - struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop); + struct folio *folio; + struct nilfs_dir_entry *de, *next_de; + size_t limit; + char *msg; + de = nilfs_get_folio(dir, 0, &folio); if (IS_ERR(de)) return NULL; - return nilfs_next_entry(de); + + limit = nilfs_last_byte(dir, 0); /* is a multiple of chunk size */ + if (unlikely(!limit || le64_to_cpu(de->inode) != dir->i_ino || + !nilfs_match(1, ".", de))) { + msg = "missing '.'"; + goto fail; + } + + next_de = nilfs_next_entry(de); + /* + * If "next_de" has not reached the end of the chunk, there is + * at least one more record. Check whether it matches "..".
+ */ + if (unlikely((char *)next_de == (char *)de + nilfs_chunk_size(dir) || + !nilfs_match(2, "..", next_de))) { + msg = "missing '..'"; + goto fail; + } + *foliop = folio; + return next_de; + +fail: + nilfs_error(dir->i_sb, "directory #%lu %s", dir->i_ino, msg); + folio_release_kmap(folio, de); + return NULL; } ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) -- cgit From e2e33caa5dc2eae7bddf88b22ce11ec3d760e5cd Mon Sep 17 00:00:00 2001 From: Hobin Woo Date: Fri, 5 Jul 2024 12:27:25 +0900 Subject: ksmbd: discard write access to the directory open may_open() does not allow a directory to be opened with write access. However, some write flags set by the client result in write access being added on the server, making ksmbd incompatible with FUSE file systems. Simply, let's discard the write access when opening a directory. list_add corruption. next is NULL. ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:26! pc : __list_add_valid+0x88/0xbc lr : __list_add_valid+0x88/0xbc Call trace: __list_add_valid+0x88/0xbc fuse_finish_open+0x11c/0x170 fuse_open_common+0x284/0x5e8 fuse_dir_open+0x14/0x24 do_dentry_open+0x2a4/0x4e0 dentry_open+0x50/0x80 smb2_open+0xbe4/0x15a4 handle_ksmbd_work+0x478/0x5ec process_one_work+0x1b4/0x448 worker_thread+0x25c/0x430 kthread+0x104/0x1d4 ret_from_fork+0x10/0x20 Cc: stable@vger.kernel.org Signed-off-by: Yoonho Shin Signed-off-by: Hobin Woo Acked-by: Namjae Jeon Signed-off-by: Steve French --- fs/smb/server/smb2pdu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 786cd45fe18f..840c71c66b30 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2051,15 +2051,22 @@ out_err1: * @access: file access flags * @disposition: file disposition flags * @may_flags: set with MAY_ flags + * @is_dir: is creating open flags for directory * * Return: file open flags */ static int smb2_create_open_flags(bool file_present, __le32 access, __le32 disposition, - int *may_flags) + int *may_flags, + bool is_dir) { int oflags = O_NONBLOCK | O_LARGEFILE; + if (is_dir) { + access &= ~FILE_WRITE_DESIRE_ACCESS_LE; + ksmbd_debug(SMB, "Discard write access to a directory\n"); + } + if (access & FILE_READ_DESIRED_ACCESS_LE && access & FILE_WRITE_DESIRE_ACCESS_LE) { oflags |= O_RDWR; @@ -3167,7 +3174,9 @@ int smb2_open(struct ksmbd_work *work) open_flags = smb2_create_open_flags(file_present, daccess, req->CreateDisposition, - &may_flags); + &may_flags, + req->CreateOptions & FILE_DIRECTORY_FILE_LE || + (file_present && S_ISDIR(d_inode(path.dentry)->i_mode))); if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { if (open_flags & (O_CREAT | O_TRUNC)) { -- cgit From 1b3ec4f7c03d4b07bad70697d7e2f4088d2cfe92 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Jul 2024 18:44:48 -0400 Subject: filelock: fix potential use-after-free in posix_lock_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Light Hsieh reported a KASAN UAF warning in trace_posix_lock_inode(). The request pointer had been changed earlier to point to a lock entry that was added to the inode's list. However, before the tracepoint could fire, another task raced in and freed that lock. Fix this by moving the tracepoint inside the spinlock, which should ensure that this doesn't happen.
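The rule the fix applies is general: a diagnostic that dereferences a lock-protected object must run before the lock is dropped. A small pthread-based sketch of the safe shape (illustrative, not the filelock code):

#include <pthread.h>
#include <stdio.h>

struct lock_entry {
	long start, end;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void insert_entry(struct lock_entry *req)
{
	pthread_mutex_lock(&list_lock);
	/* ... link 'req' into the shared list ... */

	/* Trace while still holding the lock: once it is dropped,
	 * another thread may unlink and free 'req', and touching it
	 * for the trace would be a use-after-free. */
	printf("lock %ld-%ld inserted\n", req->start, req->end);
	pthread_mutex_unlock(&list_lock);
}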
Fixes: 74f6f5912693 ("locks: fix KASAN: use-after-free in trace_event_raw_event_filelock_lock") Link: https://lore.kernel.org/linux-fsdevel/724ffb0a2962e912ea62bb0515deadf39c325112.camel@kernel.org/ Reported-by: Light Hsieh (謝明燈) Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20240702-filelock-6-10-v1-1-96e766aadc98@kernel.org Reviewed-by: Alexander Aring Signed-off-by: Christian Brauner --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 90c8746874de..2a445776dae3 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1367,9 +1367,9 @@ retry: locks_wake_up_blocks(&left->c); } out: + trace_posix_lock_inode(inode, request, error); spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); - trace_posix_lock_inode(inode, request, error); /* * Free any unused locks. */ -- cgit From aabfe57ebaa75841db47ea59091ec3c5a06d2f52 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 3 Jul 2024 08:13:01 -0400 Subject: vfs: don't mod negative dentry count when on shrinker list The nr_dentry_negative counter is intended to only account negative dentries that are present on the superblock LRU. Therefore, the LRU add, remove and isolate helpers modify the counter based on whether the dentry is negative, but the shrinker list related helpers do not modify the counter, and the paths that change a dentry between positive and negative only do so if DCACHE_LRU_LIST is set. The problem with this is that a dentry on a shrinker list still has DCACHE_LRU_LIST set to indicate ->d_lru is in use. The additional DCACHE_SHRINK_LIST flag denotes whether the dentry is on LRU or a shrink related list. Therefore if a relevant operation (i.e. unlink) occurs while a dentry is present on a shrinker list, and the associated codepath only checks for DCACHE_LRU_LIST, then it is technically possible to modify the negative dentry count for a dentry that is off the LRU. Since the shrinker list related helpers do not modify the negative dentry count (because non-LRU dentries should not be included in the count) when the dentry is ultimately removed from the shrinker list, this can cause the negative dentry count to become permanently inaccurate. This problem can be reproduced via a heavy file create/unlink vs. drop_caches workload. On an 80xcpu system, I start 80 tasks each running a 1k file create/delete loop, and one task spinning on drop_caches. After 10 minutes or so of runtime, the idle/clean cache negative dentry count increases from somewhere in the range of 5-10 entries to several hundred (and increasingly grows beyond nr_dentry_unused). Tweak the logic in the paths that turn a dentry negative or positive to filter out the case where the dentry is present on a shrink related list. This allows the above workload to maintain an accurate negative dentry count. 
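The corrected predicate is worth spelling out, since DCACHE_LRU_LIST alone only says ->d_lru is linked somewhere. A tiny sketch with illustrative flag values:

#include <stdbool.h>

#define LRU_LIST	0x1	/* ->d_lru is linked somewhere */
#define SHRINK_LIST	0x2	/* ...and that somewhere is a shrink list */

/* Count only dentries whose ->d_lru is on the LRU itself; the buggy
 * form (flags & LRU_LIST) also matched shrink-list dentries. */
static bool on_lru_only(unsigned int flags)
{
	return (flags & (LRU_LIST | SHRINK_LIST)) == LRU_LIST;
}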
Fixes: af0c9af1b3f6 ("fs/dcache: Track & report number of negative dentries") Signed-off-by: Brian Foster Link: https://lore.kernel.org/r/20240703121301.247680-1-bfoster@redhat.com Acked-by: Ian Kent Reviewed-by: Josef Bacik Reviewed-by: Waiman Long Signed-off-by: Christian Brauner --- fs/dcache.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 407095188f83..66515fbc9dd7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -355,7 +355,11 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) flags &= ~DCACHE_ENTRY_TYPE; WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; - if (flags & DCACHE_LRU_LIST) + /* + * The negative counter only tracks dentries on the LRU. Don't inc if + * d_lru is on another list. + */ + if ((flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } @@ -1844,9 +1848,11 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) spin_lock(&dentry->d_lock); /* - * Decrement negative dentry count if it was in the LRU list. + * The negative counter only tracks dentries on the LRU. Don't dec if + * d_lru is on another list. */ - if (dentry->d_flags & DCACHE_LRU_LIST) + if ((dentry->d_flags & + (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); -- cgit From 0570730c16307a72f8241df12363f76600baf57d Mon Sep 17 00:00:00 2001 From: Edward Adam Davis Date: Tue, 21 May 2024 13:21:46 +0800 Subject: hfsplus: fix uninit-value in copy_name [syzbot reported] BUG: KMSAN: uninit-value in sized_strscpy+0xc4/0x160 sized_strscpy+0xc4/0x160 copy_name+0x2af/0x320 fs/hfsplus/xattr.c:411 hfsplus_listxattr+0x11e9/0x1a50 fs/hfsplus/xattr.c:750 vfs_listxattr fs/xattr.c:493 [inline] listxattr+0x1f3/0x6b0 fs/xattr.c:840 path_listxattr fs/xattr.c:864 [inline] __do_sys_listxattr fs/xattr.c:876 [inline] __se_sys_listxattr fs/xattr.c:873 [inline] __x64_sys_listxattr+0x16b/0x2f0 fs/xattr.c:873 x64_sys_call+0x2ba0/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:3877 [inline] slab_alloc_node mm/slub.c:3918 [inline] kmalloc_trace+0x57b/0xbe0 mm/slub.c:4065 kmalloc include/linux/slab.h:628 [inline] hfsplus_listxattr+0x4cc/0x1a50 fs/hfsplus/xattr.c:699 vfs_listxattr fs/xattr.c:493 [inline] listxattr+0x1f3/0x6b0 fs/xattr.c:840 path_listxattr fs/xattr.c:864 [inline] __do_sys_listxattr fs/xattr.c:876 [inline] __se_sys_listxattr fs/xattr.c:873 [inline] __x64_sys_listxattr+0x16b/0x2f0 fs/xattr.c:873 x64_sys_call+0x2ba0/0x3b50 arch/x86/include/generated/asm/syscalls_64.h:195 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcf/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f [Fix] When allocating memory to strbuf, initialize memory to 0. 
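The general rule behind the fix: a buffer whose tail may be read by string helpers must be zero-filled at allocation. A userspace analogue with calloc() standing in for kzalloc() (illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t len = 64;
	/* calloc() plays the role of kzalloc(): every byte is defined
	 * even if the conversion below fills only part of the buffer. */
	char *strbuf = calloc(1, len + 1);
	char out[32];

	if (!strbuf)
		return 1;
	memcpy(strbuf, "osx.", 4);	/* partial fill; rest stays zeroed */

	/* With malloc() the bytes after the prefix would be undefined
	 * and this copy would read uninitialized memory. */
	snprintf(out, sizeof(out), "%s", strbuf);
	printf("%s\n", out);
	free(strbuf);
	return 0;
}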
Reported-and-tested-by: syzbot+efde959319469ff8d4d7@syzkaller.appspotmail.com Signed-off-by: Edward Adam Davis Link: https://lore.kernel.org/r/tencent_8BBB6433BC9E1C1B7B4BDF1BF52574BA8808@qq.com Reported-and-tested-by: syzbot+01ade747b16e9c8030e0@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/hfsplus/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 5a400259ae74..9a1a93e3888b 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -696,7 +696,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) return err; } - strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + + strbuf = kzalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); if (!strbuf) { res = -ENOMEM; -- cgit From 3d1bec293378700dddc087d4d862306702276c23 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 9 Jul 2024 20:58:39 +0100 Subject: minixfs: Fix minixfs_rename with HIGHMEM minixfs now uses kmap_local_page(), so we can't call kunmap() to undo it. This one call was missed as part of the commit this fixes. Fixes: 6628f69ee66a ("minixfs: Use dir_put_page() in minix_unlink() and minix_rename()") Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240709195841.1986374-1-willy@infradead.org Signed-off-by: Christian Brauner --- fs/minix/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/minix/namei.c b/fs/minix/namei.c index d6031acc34f0..a944a0f17b53 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -213,8 +213,7 @@ static int minix_rename(struct mnt_idmap *idmap, if (!new_de) goto out_dir; err = minix_set_link(new_de, new_page, old_inode); - kunmap(new_page); - put_page(new_page); + unmap_and_put_page(new_page, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); -- cgit From 0435773239b5afe25801e83c8252e04299a0e306 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 29 Jun 2024 21:40:57 -0400 Subject: bcachefs: Fix journal getting stuck on a flush commit silly race Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index db24ce21b2ac..f17c478a18e8 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1762,11 +1762,13 @@ static CLOSURE_CALLBACK(journal_write_preflush) if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); - closure_wait(&j->async_wait, cl); + if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + continue_at(cl, journal_write_preflush, j->wq); + return; + } spin_unlock(&j->lock); - - continue_at(cl, journal_write_preflush, j->wq); - return; } if (w->separate_flush) { -- cgit From b02f973e67589cf617f229250e2a738ab62ca666 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 1 Jul 2024 16:23:54 -0400 Subject: bcachefs: Fix bch2_inode_insert() race path for tmpfiles Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f9c9a95d7d4c..1768b2678b38 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -194,6 +194,12 @@ static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_ino * discard_new_inode() expects it to be set...
*/ inode->v.i_flags |= I_NEW; + /* + * We don't want bch2_evict_inode() to delete the inode on disk, + * we just raced and had another inode in cache. Normally new + * inodes don't have nlink == 0 - except tmpfiles do... + */ + set_nlink(&inode->v, 1); discard_new_inode(&inode->v); inode = old; } else { -- cgit From 86d81ec5f5f05846c7c6e48ffb964b24cba2e669 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 3 Jul 2024 15:09:55 +0800 Subject: bcachefs: Mark bch_inode_info as SLAB_ACCOUNT After commit 230e9fc28604 ("slab: add SLAB_ACCOUNT flag"), we need to mark the inode cache as SLAB_ACCOUNT, similar to commit 5d097056c9a0 ("kmemcg: account for certain kmem allocations to memcg"). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 1768b2678b38..98238da7c8c1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2071,7 +2071,8 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT); if (!bch2_inode_cache) goto err; -- cgit From 8ed58789fc43343d5a55565b204db40a514c06fc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2024 11:56:19 -0400 Subject: bcachefs: Fix undefined behaviour in eytzinger1_first() Signed-off-by: Kent Overstreet --- fs/bcachefs/eytzinger.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 24840aee335c..795f4fc0bab1 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -48,7 +48,7 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size); + return size ? rounddown_pow_of_two(size) : 0; } static inline unsigned eytzinger1_last(unsigned size) @@ -101,7 +101,9 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return size + ?
(size + 1 - rounddown_pow_of_two(size)) << 1 + : 0; } static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, -- cgit From 0f1f7324da0a65c823f16c0abae3bf2c18ee43e7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2024 12:58:34 -0400 Subject: bcachefs: Log mount failure error code Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 98238da7c8c1..74a1e12a7a14 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2032,6 +2032,8 @@ err_put_super: __bch2_fs_stop(c); deactivate_locked_super(sb); err: + if (ret) + pr_err("error: %s", bch2_err_str(ret)); /* * On an inconsistency error in recovery we might see an -EROFS derived * errorcode (from the journal), but we don't want to return that to -- cgit From ad8b68cd3941c547cf73154244488ab412076177 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2024 13:25:13 -0400 Subject: bcachefs: bch2_data_update_to_text() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 34 ++++++++++++++++++++++++++++++++++ fs/bcachefs/data_update.h | 5 +++++ fs/bcachefs/move.c | 25 ------------------------- 3 files changed, 39 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 1a0072eef109..eeae760a15d3 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -5,7 +5,9 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "compress.h" #include "data_update.h" +#include "disk_groups.h" #include "ec.h" #include "error.h" #include "extents.h" @@ -454,6 +456,38 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, } } +void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + printbuf_tabstop_push(out, 20); + prt_str(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + + prt_str(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + + prt_str(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + + prt_str(out, "compression:\t"); + bch2_compression_opt_to_text(out, background_compression(*io_opts)); + prt_newline(out); + + prt_str(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); +} + +void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index 991095bbd469..8d36365bdea8 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -17,6 +17,9 @@ struct data_update_opts { unsigned write_flags; }; +void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_opts *, struct data_update_opts *); + struct data_update { /* extent being updated: */ enum btree_id btree_id; @@ -27,6 +30,8 @@ struct data_update { struct bch_write_op op; }; +void bch2_data_update_to_text(struct printbuf *, struct data_update *); + int bch2_data_update_index_update(struct bch_write_op *); void bch2_data_update_read_done(struct data_update *, diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c 
index 6e477fadaa2a..e714e3bd5bbb 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -36,31 +36,6 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - printbuf_tabstop_push(out, 20); - prt_str(out, "rewrite ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); - prt_newline(out); - - prt_str(out, "kill ptrs:\t"); - bch2_prt_u64_base2(out, data_opts->kill_ptrs); - prt_newline(out); - - prt_str(out, "target:\t"); - bch2_target_to_text(out, c, data_opts->target); - prt_newline(out); - - prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); - prt_newline(out); - - prt_str(out, "extra replicas:\t"); - prt_u64(out, data_opts->extra_replicas); -} - static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) -- cgit From f49d2c9835f95fa078ea8a8eba6de9cbddb9eb33 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 3 Jul 2024 13:23:58 -0400 Subject: bcachefs: Warn on attempting a move with no replicas Instead of popping an assert in bch2_write(), WARN and print out some debugging info. Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index eeae760a15d3..0087b8555ead 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -677,6 +677,16 @@ int bch2_data_update_init(struct btree_trans *trans, if (!(durability_have + durability_removing)) m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1); + if (!m->op.nr_replicas) { + struct printbuf buf = PRINTBUF; + + bch2_data_update_to_text(&buf, m); + WARN(1, "trying to move an extent, but nr_replicas=0\n%s", buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_data_update_done; + goto done; + } + m->op.nr_replicas_required = m->op.nr_replicas; if (reserve_sectors) { -- cgit From 0f6f8f76936b22e9a466ca6bd49aa0261f698276 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 4 Jul 2024 21:18:06 -0400 Subject: bcachefs: Fix missing error check in journal_entry_btree_keys_validate() Closes: https://syzkaller.appspot.com/bug?extid=8996d8f176cf946ef641 Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index f17c478a18e8..2326e2cb9cd2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -415,6 +415,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, flags|BCH_VALIDATE_journal); if (ret == FSCK_DELETED_KEY) continue; + else if (ret) + return ret; k = bkey_next(k); } -- cgit From 7d7f71cd8763a296d02dff9514447aa3de199c47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Jul 2024 09:10:35 -0400 Subject: bcachefs: Add missing bch2_trans_begin() This fixes a 'transaction should be locked' error in backpointers fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index e8037b5d8332..6d8b1bc90be0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -770,6 +770,8 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, !((1U << btree) & btree_interior_mask)) continue;
+ bch2_trans_begin(trans); + __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, 0, depth, BTREE_ITER_prefetch, b, ret) { -- cgit
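The journal preflush fix earlier in this series closes its race with the classic check/lock/re-check idiom: the unlocked test is only a fast path, and the decision to wait is made again under the lock, so a wakeup cannot slip in between the two tests. A generic sketch under assumed names (flush_state, flush_done(), add_waiter() and proceed() are placeholders, not the bcachefs API):

/* Generic check/lock/re-check sketch (placeholder types and helpers,
 * not the bcachefs API). Only the test made while holding the lock
 * decides whether to register as a waiter, so a wakeup issued between
 * the two tests cannot be lost.
 */
static void maybe_wait_for_flush(struct flush_state *s, struct waiter *w)
{
	if (!flush_done(s)) {			/* unlocked fast path */
		spin_lock(&s->lock);
		if (!flush_done(s)) {		/* re-check under the lock */
			add_waiter(s, w);	/* cannot race the waker now */
			spin_unlock(&s->lock);
			return;			/* resume via wakeup callback */
		}
		spin_unlock(&s->lock);		/* raced: flush completed */
	}
	proceed(w);				/* condition already satisfied */
}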