aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/alloc_background.c22
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/btree_cache.c9
-rw-r--r--fs/bcachefs/btree_gc.c17
-rw-r--r--fs/bcachefs/btree_io.c8
-rw-r--r--fs/bcachefs/btree_iter.c11
-rw-r--r--fs/bcachefs/btree_key_cache.c33
-rw-r--r--fs/bcachefs/btree_locking.c1
-rw-r--r--fs/bcachefs/btree_node_scan.c9
-rw-r--r--fs/bcachefs/buckets.c293
-rw-r--r--fs/bcachefs/buckets.h17
-rw-r--r--fs/bcachefs/buckets_types.h2
-rw-r--r--fs/bcachefs/data_update.c3
-rw-r--r--fs/bcachefs/ec.c26
-rw-r--r--fs/bcachefs/extents.c9
-rw-r--r--fs/bcachefs/fs-ioctl.c17
-rw-r--r--fs/bcachefs/fs.c3
-rw-r--r--fs/bcachefs/fsck.c3
-rw-r--r--fs/bcachefs/io_read.c37
-rw-r--r--fs/bcachefs/io_write.c19
-rw-r--r--fs/bcachefs/move.c16
-rw-r--r--fs/bcachefs/movinggc.c7
-rw-r--r--fs/bcachefs/super-io.c6
-rw-r--r--fs/bcachefs/super.c10
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/disk-io.c10
-rw-r--r--fs/btrfs/extent_io.c60
-rw-r--r--fs/btrfs/file.c16
-rw-r--r--fs/btrfs/ordered-data.c31
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/cachefiles/daemon.c3
-rw-r--r--fs/cachefiles/internal.h5
-rw-r--r--fs/cachefiles/ondemand.c218
-rw-r--r--fs/debugfs/inode.c10
-rw-r--r--fs/file.c4
-rw-r--r--fs/iomap/buffered-io.c56
-rw-r--r--fs/nfs/dir.c77
-rw-r--r--fs/nfs/nfs4proc.c24
-rw-r--r--fs/nfs/pagelist.c5
-rw-r--r--fs/nfs/symlink.c2
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/segment.c3
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/smb/client/smb2pdu.c3
-rw-r--r--fs/smb/client/smb2transport.c2
-rw-r--r--fs/smb/server/smb2pdu.c22
-rw-r--r--fs/smb/server/vfs.c17
-rw-r--r--fs/smb/server/vfs.h3
-rw-r--r--fs/smb/server/vfs_cache.c3
-rw-r--r--fs/xfs/libxfs/xfs_sb.c7
50 files changed, 778 insertions, 415 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 346cd91f91f9..c4b6601f5b74 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -741,6 +741,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
@@ -860,8 +861,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
percpu_down_read(&c->mark_lock);
- if (new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+ if (new_a->gen != old_a->gen) {
+ u8 *gen = bucket_gen(ca, new.k->p.offset);
+ if (unlikely(!gen)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ *gen = new_a->gen;
+ }
bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
percpu_up_read(&c->mark_lock);
@@ -895,6 +902,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, new.k->p.offset);
+ if (unlikely(!g)) {
+ percpu_up_read(&c->mark_lock);
+ goto invalid_bucket;
+ }
+ g->gen_valid = 1;
bucket_lock(g);
@@ -910,8 +922,14 @@ int bch2_trigger_alloc(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
err:
+ printbuf_exit(&buf);
bch2_dev_put(ca);
return ret;
+invalid_bucket:
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
+ ret = -EIO;
+ goto err;
}
/*
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 2a538eb2af11..2992a644d822 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -790,7 +790,8 @@ struct bch_fs {
/* BTREE CACHE */
struct bio_set btree_bio;
- struct workqueue_struct *io_complete_wq;
+ struct workqueue_struct *btree_read_complete_wq;
+ struct workqueue_struct *btree_write_submit_wq;
struct btree_root btree_roots_known[BTREE_ID_NR];
DARRAY(struct btree_root) btree_roots_extra;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 9e4ed75d3675..4f5e411771ba 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -91,10 +91,11 @@ static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch_btree_cache_params = {
- .head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, hash_val),
- .key_len = sizeof(u64),
- .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .head_offset = offsetof(struct btree, hash),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
+ .automatic_shrinking = true,
};
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dc97991bcd6a..0e477a926579 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -874,6 +874,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
const struct bch_alloc_v4 *old;
int ret;
+ if (!bucket_valid(ca, k.k->p.offset))
+ return 0;
+
old = bch2_alloc_to_v4(k, &old_convert);
gc = new = *old;
@@ -990,6 +993,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = ca->mi.nbuckets;
+ buckets->nbuckets_minus_first =
+ buckets->nbuckets - buckets->first_bucket;
rcu_assign_pointer(ca->buckets_gc, buckets);
}
@@ -1003,12 +1008,14 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
continue;
}
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+ if (bucket_valid(ca, k.k->p.offset)) {
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
- struct bucket *g = gc_bucket(ca, k.k->p.offset);
- g->gen_valid = 1;
- g->gen = a->gen;
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
+ g->gen_valid = 1;
+ g->gen = a->gen;
+ }
0;
})));
bch2_dev_put(ca);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 829c1b91477d..7bca15c604f5 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1389,7 +1389,7 @@ static void btree_node_read_endio(struct bio *bio)
bch2_latency_acct(ca, rb->start_time, READ);
}
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
struct btree_node_read_all {
@@ -1656,7 +1656,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
- c->io_complete_wq);
+ c->btree_read_complete_wq);
}
return 0;
@@ -1737,7 +1737,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
if (sync)
btree_node_read_work(&rb->work);
else
- queue_work(c->io_complete_wq, &rb->work);
+ queue_work(c->btree_read_complete_wq, &rb->work);
}
}
@@ -2229,7 +2229,7 @@ do_write:
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
- queue_work(c->io_complete_wq, &wbio->work);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
return;
err:
set_btree_node_noevict(b);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index d3bcb4e4e230..3694c600a3ad 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -221,11 +221,8 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
- unsigned i;
-
- EBUG_ON(path->btree_id >= BTREE_ID_NR);
- for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+ for (unsigned i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
if (!path->l[i].b) {
BUG_ON(!path->cached &&
bch2_btree_id_root(c, path->btree_id)->b->c.level > i);
@@ -251,8 +248,6 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- BUG_ON(iter->btree_id >= BTREE_ID_NR);
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -3406,8 +3401,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
bch2_time_stats_exit(&s->lock_hold_times);
}
- if (c->btree_trans_barrier_initialized)
+ if (c->btree_trans_barrier_initialized) {
+ synchronize_srcu_expedited(&c->btree_trans_barrier);
cleanup_srcu_struct(&c->btree_trans_barrier);
+ }
mempool_exit(&c->btree_trans_mem_pool);
mempool_exit(&c->btree_trans_pool);
}
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 34056aaece00..2d3c0d45c37f 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -32,10 +32,11 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
}
static const struct rhashtable_params bch2_btree_key_cache_params = {
- .head_offset = offsetof(struct bkey_cached, hash),
- .key_offset = offsetof(struct bkey_cached, key),
- .key_len = sizeof(struct bkey_cached_key),
- .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .head_offset = offsetof(struct bkey_cached, hash),
+ .key_offset = offsetof(struct bkey_cached, key),
+ .key_len = sizeof(struct bkey_cached_key),
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
+ .automatic_shrinking = true,
};
__flatten
@@ -840,7 +841,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_nonpcpu--;
bc->freed++;
}
@@ -854,7 +854,6 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
- freed++;
bc->nr_freed_pcpu--;
bc->freed++;
}
@@ -876,23 +875,22 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
bc->skipped_dirty++;
- goto next;
} else if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) {
clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
bc->skipped_accessed++;
- goto next;
- } else if (bkey_cached_lock_for_evict(ck)) {
+ } else if (!bkey_cached_lock_for_evict(ck)) {
+ bc->skipped_lock_fail++;
+ } else {
bkey_cached_evict(bc, ck);
bkey_cached_free(bc, ck);
bc->moved_to_freelist++;
- } else {
- bc->skipped_lock_fail++;
+ freed++;
}
scanned++;
if (scanned >= nr)
break;
-next:
+
pos = next;
}
@@ -917,6 +915,14 @@ static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
+ /*
+ * Avoid hammering our shrinker too much if it's nearly empty - the
+ * shrinker code doesn't take into account how big our cache is, if it's
+ * mostly empty but the system is under memory pressure it causes nasty
+ * lock contention:
+ */
+ nr -= 128;
+
return max(0L, nr);
}
@@ -1025,9 +1031,10 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
- shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index c3e9b0cc7bbd..d66fff22109a 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -215,6 +215,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
if (unlikely(!best)) {
struct printbuf buf = PRINTBUF;
+ buf.atomic++;
prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 45cb8149d374..2cb0442f6cc9 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -72,10 +72,11 @@ static bool found_btree_node_is_readable(struct btree_trans *trans,
struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
bool ret = !IS_ERR_OR_NULL(b);
- if (ret) {
- f->sectors_written = b->written;
- six_unlock_read(&b->c.lock);
- }
+ if (!ret)
+ return ret;
+
+ f->sectors_written = b->written;
+ six_unlock_read(&b->c.lock);
/*
* We might update this node's range; if that happens, we need the node
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index ed97712d0db1..743d57eba760 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -465,143 +465,172 @@ int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64
return bch2_update_replicas_list(trans, &r.e, sectors);
}
-int bch2_check_fix_ptrs(struct btree_trans *trans,
- enum btree_id btree, unsigned level, struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags)
+static int bch2_check_fix_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ const union bch_extent_entry *entry,
+ bool *do_update)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry_c;
- struct extent_ptr_decoded p = { 0 };
- bool do_update = false;
struct printbuf buf = PRINTBUF;
int ret = 0;
- percpu_down_read(&c->mark_lock);
+ struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
+ if (!ca) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to missing device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ return 0;
+ }
- bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
- if (!ca) {
- if (fsck_err(c, ptr_to_invalid_device,
- "pointer to missing device %u\n"
- "while marking %s",
- p.ptr.dev,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- continue;
- }
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ if (!g) {
+ if (fsck_err(c, ptr_to_invalid_device,
+ "pointer to invalid bucket on device %u\n"
+ "while marking %s",
+ p.ptr.dev,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+ goto out;
+ }
- struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
- enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry_c);
+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
- if (fsck_err_on(!g->gen_valid,
- c, ptr_to_missing_alloc_key,
- "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(!g->gen_valid,
+ c, ptr_to_missing_alloc_key,
+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ } else {
+ *do_update = true;
}
+ }
- if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (!p.ptr.cached &&
- (g->data_type != BCH_DATA_btree ||
- data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
+ if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (!p.ptr.cached &&
+ (g->data_type != BCH_DATA_btree ||
+ data_type == BCH_DATA_btree)) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = 0;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
+ }
+ }
+
+ if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+ c, ptr_gen_newer_than_bucket_gen,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+ c, stale_dirty_ptr,
+ "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+ bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
+ p.ptr.gen, g->gen,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+ *do_update = true;
+
+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
+ goto out;
+
+ if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
+ c, ptr_bucket_data_type_mismatch,
+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s",
+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ if (data_type == BCH_DATA_btree) {
+ g->gen_valid = true;
+ g->gen = p.ptr.gen;
+ g->data_type = data_type;
+ g->dirty_sectors = 0;
+ g->cached_sectors = 0;
+ } else {
+ *do_update = true;
}
+ }
+
+ if (p.has_ec) {
+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
- if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
- c, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+ if (fsck_err_on(!m || !m->alive,
+ c, ptr_to_missing_stripe,
+ "pointer to nonexistent stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
- if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
- c, stale_dirty_ptr,
- "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p),
+ c, ptr_to_incorrect_stripe,
+ "pointer does not match stripe %llu\n"
"while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_type_str(ptr_data_type(k.k, &p.ptr)),
- p.ptr.gen, g->gen,
+ (u64) p.ec.idx,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
+ *do_update = true;
+ }
+out:
+fsck_err:
+ bch2_dev_put(ca);
+ printbuf_exit(&buf);
+ return ret;
+}
- if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
- goto next;
+int bch2_check_fix_ptrs(struct btree_trans *trans,
+ enum btree_id btree, unsigned level, struct bkey_s_c k,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry_c;
+ struct extent_ptr_decoded p = { 0 };
+ bool do_update = false;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
- if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type),
- c, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = data_type;
- g->dirty_sectors = 0;
- g->cached_sectors = 0;
- } else {
- do_update = true;
- }
- }
+ percpu_down_read(&c->mark_lock);
- if (p.has_ec) {
- struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
-
- if (fsck_err_on(!m || !m->alive, c,
- ptr_to_missing_stripe,
- "pointer to nonexistent stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
-
- if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
- ptr_to_incorrect_stripe,
- "pointer does not match stripe %llu\n"
- "while marking %s",
- (u64) p.ec.idx,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- do_update = true;
- }
-next:
- bch2_dev_put(ca);
+ bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+ goto err;
}
if (do_update) {
@@ -716,7 +745,6 @@ found:
bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
}
err:
-fsck_err:
percpu_up_read(&c->mark_lock);
printbuf_exit(&buf);
return ret;
@@ -987,6 +1015,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
bool insert = !(flags & BTREE_TRIGGER_overwrite);
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_fs *c = trans->c;
@@ -1019,6 +1048,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ p.ptr.dev,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_pointer(trans, ca, k, &p.ptr, *sectors, bp.data_type, &new);
@@ -1027,10 +1063,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -1318,10 +1356,11 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 b, enum bch_data_type data_type, unsigned sectors,
enum btree_iter_update_trigger_flags flags)
{
- int ret = 0;
-
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, b);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
+ ca->dev_idx, bch2_data_type_str(data_type)))
+ goto err_unlock;
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1330,29 +1369,27 @@ static int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
bch2_data_type_str(g->data_type),
- bch2_data_type_str(data_type))) {
- ret = -EIO;
+ bch2_data_type_str(data_type)))
goto err;
- }
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
bch2_data_type_str(g->data_type ?: data_type),
- g->dirty_sectors, sectors)) {
- ret = -EIO;
+ g->dirty_sectors, sectors))
goto err;
- }
g->data_type = data_type;
g->dirty_sectors += sectors;
struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
+ bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+ percpu_up_read(&c->mark_lock);
+ return 0;
err:
bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update(c, ca, &old, &new, 0, true);
+err_unlock:
percpu_up_read(&c->mark_lock);
- return ret;
+ return -EIO;
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1595,6 +1632,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bucket_gens->first_bucket = ca->mi.first_bucket;
bucket_gens->nbuckets = nbuckets;
+ bucket_gens->nbuckets_minus_first =
+ bucket_gens->nbuckets - bucket_gens->first_bucket;
if (resize) {
down_write(&c->gc_lock);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 617ffde2fb7a..80ee0be9793e 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -93,7 +93,8 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = gc_bucket_array(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - buckets->first_bucket >= buckets->nbuckets_minus_first)
+ return NULL;
return buckets->b + b;
}
@@ -110,7 +111,8 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
{
struct bucket_gens *gens = bucket_gens(ca);
- BUG_ON(!bucket_valid(ca, b));
+ if (b - gens->first_bucket >= gens->nbuckets_minus_first)
+ return NULL;
return gens->b + b;
}
@@ -170,19 +172,22 @@ static inline int gen_after(u8 a, u8 b)
return r > 0 ? r : 0;
}
-static inline u8 dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- return gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+ u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr));
+ if (!gen)
+ return -1;
+ return gen_after(*gen, ptr->gen);
}
/**
* dev_ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
-static inline u8 dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
+static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
rcu_read_lock();
- u8 ret = dev_ptr_stale_rcu(ca, ptr);
+ int ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
return ret;
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 6a31740222a7..f636e17c4caf 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -22,6 +22,7 @@ struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
struct bucket b[];
};
@@ -29,6 +30,7 @@ struct bucket_gens {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
+ size_t nbuckets_minus_first;
u8 b[];
};
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 0d807c2ce9c6..1a0072eef109 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -202,9 +202,8 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
-restart_drop_extra_replicas:
-
rcu_read_lock();
+restart_drop_extra_replicas:
bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d8b9beca3776..83e279d41829 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -268,6 +268,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx;
+ struct printbuf buf = PRINTBUF;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
@@ -289,6 +290,13 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (flags & BTREE_TRIGGER_gc) {
percpu_down_read(&c->mark_lock);
struct bucket *g = gc_bucket(ca, bucket.offset);
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ ptr->dev,
+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+ ret = -EIO;
+ goto err_unlock;
+ }
+
bucket_lock(g);
struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
@@ -297,10 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
bch2_dev_usage_update(c, ca, &old, &new, 0, true);
}
bucket_unlock(g);
+err_unlock:
percpu_up_read(&c->mark_lock);
}
err:
bch2_dev_put(ca);
+ printbuf_exit(&buf);
return ret;
}
@@ -714,10 +724,12 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(ca->fs,
- "error %s stripe: stale pointer after io",
- bio_data_dir(bio) == READ ? "reading from" : "writing to");
+ "error %s stripe: stale/invalid pointer (%i) after io",
+ bio_data_dir(bio) == READ ? "reading from" : "writing to",
+ stale);
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
@@ -743,10 +755,12 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
return;
}
- if (dev_ptr_stale(ca, ptr)) {
+ int stale = dev_ptr_stale(ca, ptr);
+ if (stale) {
bch_err_ratelimited(c,
- "error %s stripe: stale pointer",
- rw == READ ? "reading from" : "writing to");
+ "error %s stripe: stale pointer (%i)",
+ rw == READ ? "reading from" : "writing to",
+ stale);
clear_bit(idx, buf->valid);
return;
}
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 469037929685..410b8bd81b5a 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -137,7 +137,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
- if (p.ptr.cached && (!ca || dev_ptr_stale(ca, &p.ptr)))
+ if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
@@ -999,7 +999,7 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
- dev_ptr_stale_rcu(ca, ptr));
+ dev_ptr_stale_rcu(ca, ptr) > 0);
rcu_read_unlock();
return bkey_deleted(k.k);
@@ -1024,8 +1024,11 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
prt_str(out, " cached");
if (ptr->unwritten)
prt_str(out, " unwritten");
- if (bucket_valid(ca, b) && dev_ptr_stale_rcu(ca, ptr))
+ int stale = dev_ptr_stale_rcu(ca, ptr);
+ if (stale > 0)
prt_printf(out, " stale");
+ else if (stale)
+ prt_printf(out, " invalid");
}
rcu_read_unlock();
--out->atomic;
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 205a323ffc6d..3551a737181b 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -308,8 +308,8 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
return ret;
}
-static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+ struct bch_ioctl_subvolume arg)
{
struct inode *dir;
struct bch_inode_info *inode;
@@ -406,9 +406,12 @@ retry:
!arg.src_ptr)
snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol;
+ down_write(&c->snapshot_create_lock);
inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir),
dst_dentry, arg.mode|S_IFDIR,
0, snapshot_src, create_flags);
+ up_write(&c->snapshot_create_lock);
+
error = PTR_ERR_OR_ZERO(inode);
if (error)
goto err3;
@@ -429,16 +432,6 @@ err1:
return error;
}
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
- struct bch_ioctl_subvolume arg)
-{
- down_write(&c->snapshot_create_lock);
- long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
- up_write(&c->snapshot_create_lock);
-
- return ret;
-}
-
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index cd388f1702dc..77126992dba8 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -227,7 +227,9 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
mutex_init(&inode->ei_update_lock);
two_state_lock_init(&inode->ei_pagecache_lock);
INIT_LIST_HEAD(&inode->ei_vfs_inode_list);
+ inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
@@ -1967,6 +1969,7 @@ got_sb:
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
sb->s_uuid = c->sb.user_uuid;
+ sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index fd277bd58ed3..921bcdb3e5e4 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -1677,6 +1677,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
trans_was_restarted(trans, restart_count);
}
+noinline_for_stack
static int check_dirent_inode_dirent(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1773,6 +1774,7 @@ out_noiter:
return ret;
}
+noinline_for_stack
static int check_dirent_target(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c_dirent d,
@@ -1847,6 +1849,7 @@ found:
return ret;
}
+noinline_for_stack
static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d)
{
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index f57486794484..c97fa7002b06 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -84,9 +84,10 @@ struct promote_op {
};
static const struct rhashtable_params bch_promote_params = {
- .head_offset = offsetof(struct promote_op, hash),
- .key_offset = offsetof(struct promote_op, pos),
- .key_len = sizeof(struct bpos),
+ .head_offset = offsetof(struct promote_op, hash),
+ .key_offset = offsetof(struct promote_op, pos),
+ .key_len = sizeof(struct bpos),
+ .automatic_shrinking = true,
};
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
@@ -776,18 +777,32 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
PTR_BUCKET_POS(ca, &ptr),
BTREE_ITER_cached);
- prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
- printbuf_indent_add(&buf, 2);
+ u8 *gen = bucket_gen(ca, iter.pos.offset);
+ if (gen) {
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
+ prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
+ printbuf_indent_add(&buf, 2);
- prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
- if (!ret) {
+ bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
+
+ prt_printf(&buf, "memory gen: %u", *gen);
+
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (!ret) {
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ }
+ } else {
+ prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n",
+ iter.pos.inode, iter.pos.offset);
+ printbuf_indent_add(&buf, 2);
+
+ prt_printf(&buf, "first bucket %u nbuckets %llu\n",
+ ca->mi.first_bucket, ca->mi.nbuckets);
+
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
}
bch2_fs_inconsistent(c, "%s", buf.buf);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 9401d13e31bb..05e0cbef420b 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1220,7 +1220,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
u32 snapshot;
struct bucket_to_lock *stale_at;
- int ret;
+ int stale, ret;
if (op->flags & BCH_WRITE_MOVE)
return;
@@ -1299,7 +1299,8 @@ retry:
BUCKET_NOCOW_LOCK_UPDATE);
rcu_read_lock();
- bool stale = gen_after(*bucket_gen(ca, i->b.offset), i->gen);
+ u8 *gen = bucket_gen(ca, i->b.offset);
+ stale = !gen ? -1 : gen_after(*gen, i->gen);
rcu_read_unlock();
if (unlikely(stale)) {
@@ -1380,8 +1381,18 @@ err_bucket_stale:
break;
}
- /* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ struct printbuf buf = PRINTBUF;
+ if (bch2_fs_inconsistent_on(stale < 0, c,
+ "pointer to invalid bucket in nocow path on device %llu\n %s",
+ stale_at->b.inode,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = -EIO;
+ } else {
+ /* We can retry this: */
+ ret = -BCH_ERR_transaction_restart;
+ }
+ printbuf_exit(&buf);
+
goto err_get_ioref;
}
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 8171f947fac8..6e477fadaa2a 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -547,6 +547,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+ bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, btree_id, start,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
@@ -920,7 +921,20 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- if (!nr_good || nr_good >= replicas)
+ rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ptr->cached &&
+ (!ca || !ca->mi.durability))
+ data_opts->kill_ptrs |= BIT(i);
+ i++;
+ }
+ rcu_read_unlock();
+
+ if (!data_opts->kill_ptrs &&
+ (!nr_good || nr_good >= replicas))
return false;
data_opts->target = 0;
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 10bfb31c151b..eb49dd045eff 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -35,9 +35,10 @@ struct buckets_in_flight {
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
- .key_len = sizeof(struct move_bucket_key),
+ .head_offset = offsetof(struct move_bucket_in_flight, hash),
+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
};
static struct move_bucket_in_flight *
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d73a0222f709..055478d21e9e 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -1310,15 +1310,15 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Device index:\t%u\n", sb->dev_idx);
- prt_str(out, "Label:\t");
+ prt_printf(out, "Label:\t");
prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label);
prt_newline(out);
- prt_str(out, "Version:\t");
+ prt_printf(out, "Version:\t");
bch2_version_to_text(out, le16_to_cpu(sb->version));
prt_newline(out);
- prt_str(out, "Version upgrade complete:\t");
+ prt_printf(out, "Version upgrade complete:\t");
bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb));
prt_newline(out);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index df2bea38e83f..65e239d32915 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -582,8 +582,10 @@ static void __bch2_fs_free(struct bch_fs *c)
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
- if (c->io_complete_wq)
- destroy_workqueue(c->io_complete_wq);
+ if (c->btree_write_submit_wq)
+ destroy_workqueue(c->btree_write_submit_wq);
+ if (c->btree_read_complete_wq)
+ destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
if (c->btree_io_complete_wq)
@@ -878,8 +880,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+ !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 91c994b569f3..6ed495ca7a31 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -89,6 +89,16 @@ enum {
BTRFS_INODE_FREE_SPACE_INODE,
/* Set when there are no capabilities in XATTs for the inode. */
BTRFS_INODE_NO_CAP_XATTR,
+ /*
+ * Set if an error happened when doing a COW write before submitting a
+ * bio or during writeback. Used for both buffered writes and direct IO
+ * writes. This is to signal a fast fsync that it has to wait for
+ * ordered extents to complete and therefore not log extent maps that
+ * point to unwritten extents (when an ordered extent completes and it
+ * has the BTRFS_ORDERED_IOERR flag set, it drops extent maps in its
+ * range).
+ */
+ BTRFS_INODE_COW_WRITE_ERROR,
};
/* in memory btrfs inode */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1b20b3e390df..38cdb8875e8e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_delayed_ref_node *ref;
- delayed_refs = &trans->delayed_refs;
-
spin_lock(&delayed_refs->lock);
- if (atomic_read(&delayed_refs->num_entries) == 0) {
- spin_unlock(&delayed_refs->lock);
- btrfs_debug(fs_info, "delayed_refs has NO entry");
- return;
- }
-
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 597387e9f040..f688fab55251 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
+ lockdep_assert_held(&page->mapping->i_private_lock);
+
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio;
+ struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@@ -3774,12 +3777,14 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
- return 0;
+ goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio))
+ if (IS_ERR(existing_folio)) {
+ existing_folio = NULL;
goto retry;
+ }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -3790,14 +3795,13 @@ retry:
return -EAGAIN;
}
- if (fs_info->nodesize < PAGE_SIZE) {
- /*
- * We're going to reuse the existing page, can drop our page
- * and subpage structure now.
- */
+finish:
+ spin_lock(&mapping->i_private_lock);
+ if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ /* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
- } else {
+ } else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@@ -3805,6 +3809,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
+ spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@@ -3813,6 +3818,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
+ eb->folio_size = folio_size(eb->folios[i]);
+ eb->folio_shift = folio_shift(eb->folios[i]);
+ /* Should not fail, as we have preallocated the memory. */
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
+ ASSERT(!ret);
+ /*
+ * To inform we have an extra eb under allocation, so that
+ * detach_extent_buffer_page() won't release the folio private when the
+ * eb hasn't been inserted into radix tree yet.
+ *
+ * The ref will be decreased when the eb releases the page, in
+ * detach_extent_buffer_page(). Thus needs no special handling in the
+ * error path.
+ */
+ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
@@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@@ -3890,7 +3910,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@@ -3927,24 +3947,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- eb->folio_size = folio_size(folio);
- eb->folio_shift = folio_shift(folio);
- spin_lock(&mapping->i_private_lock);
- /* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_folio(eb, folio, prealloc);
- ASSERT(!ret);
- /*
- * To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the folio private
- * when the eb hasn't yet been inserted into radix tree.
- *
- * The ref will be decreased when the eb released the page, in
- * detach_extent_buffer_page().
- * Thus needs no special handling in error path.
- */
- btrfs_folio_inc_eb_refs(fs_info, folio);
- spin_unlock(&mapping->i_private_lock);
-
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e764ac3f22e2..d90138683a0a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1885,6 +1885,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
if (full_sync || btrfs_is_zoned(fs_info)) {
ret = btrfs_wait_ordered_range(inode, start, len);
+ clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &BTRFS_I(inode)->runtime_flags);
} else {
/*
* Get our ordered extents as soon as possible to avoid doing
@@ -1894,6 +1895,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
&ctx.ordered_extents);
ret = filemap_fdatawait_range(inode->i_mapping, start, end);
+ if (ret)
+ goto out_release_extents;
+
+ /*
+ * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after
+ * starting and waiting for writeback, because for buffered IO
+ * it may have been set during the end IO callback
+ * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in
+ * case an error happened and we need to wait for ordered
+ * extents to complete so that any extent maps that point to
+ * unwritten locations are dropped and we don't log them.
+ */
+ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = btrfs_wait_ordered_range(inode, start, len);
}
if (ret)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c5bdd674f55c..35a413ce935d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -388,6 +388,37 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate);
spin_unlock_irqrestore(&inode->ordered_tree_lock, flags);
+ /*
+ * If this is a COW write it means we created new extent maps for the
+ * range and they point to unwritten locations if we got an error either
+ * before submitting a bio or during IO.
+ *
+ * We have marked the ordered extent with BTRFS_ORDERED_IOERR, and we
+ * are queuing its completion below. During completion, at
+ * btrfs_finish_one_ordered(), we will drop the extent maps for the
+ * unwritten extents.
+ *
+ * However because completion runs in a work queue we can end up having
+ * a fast fsync running before that. In the case of direct IO, once we
+ * unlock the inode the fsync might start, and we queue the completion
+ * before unlocking the inode. In the case of buffered IO when writeback
+ * finishes (end_bbio_data_write()) we queue the completion, so if the
+ * writeback was triggered by a fast fsync, the fsync might start
+ * logging before ordered extent completion runs in the work queue.
+ *
+ * The fast fsync will log file extent items based on the extent maps it
+ * finds, so if by the time it collects extent maps the ordered extent
+ * completion didn't happen yet, it will log file extent items that
+ * point to unwritten extents, resulting in a corruption if a crash
+ * happens and the log tree is replayed. Note that a fast fsync does not
+ * wait for completion of ordered extents in order to reduce latency.
+ *
+ * Set a flag in the inode so that the next fast fsync will wait for
+ * ordered extents to complete before starting to log.
+ */
+ if (!uptodate && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ set_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags);
+
if (ret)
btrfs_queue_ordered_fn(ordered);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 5146387b416b..26a2e5aa08e9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4860,18 +4860,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (!dropped_extents) {
- /*
- * Avoid logging extent items logged in past fsync calls
- * and leading to duplicate keys in the log tree.
- */
+ /*
+ * Avoid overlapping items in the log tree. The first time we
+ * get here, get rid of everything from a past fsync. After
+ * that, if the current extent starts before the end of the last
+ * extent we copied, truncate the last one. This can happen if
+ * an ordered extent completion modifies the subvolume tree
+ * while btrfs_next_leaf() has the tree unlocked.
+ */
+ if (!dropped_extents || key.offset < truncate_offset) {
ret = truncate_inode_items(trans, root->log_root, inode,
- truncate_offset,
+ min(key.offset, truncate_offset),
BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
+ truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 6465e2574230..06cdf1a8a16f 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -133,7 +133,7 @@ static int cachefiles_daemon_open(struct inode *inode, struct file *file)
return 0;
}
-static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
+void cachefiles_flush_reqs(struct cachefiles_cache *cache)
{
struct xarray *xa = &cache->reqs;
struct cachefiles_req *req;
@@ -159,6 +159,7 @@ static void cachefiles_flush_reqs(struct cachefiles_cache *cache)
xa_for_each(xa, index, req) {
req->error = -EIO;
complete(&req->done);
+ __xa_erase(xa, index);
}
xa_unlock(xa);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index d33169f0018b..6845a90cdfcc 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -55,6 +55,7 @@ struct cachefiles_ondemand_info {
int ondemand_id;
enum cachefiles_object_state state;
struct cachefiles_object *object;
+ spinlock_t lock;
};
/*
@@ -138,6 +139,7 @@ static inline bool cachefiles_in_ondemand_mode(struct cachefiles_cache *cache)
struct cachefiles_req {
struct cachefiles_object *object;
struct completion done;
+ refcount_t ref;
int error;
struct cachefiles_msg msg;
};
@@ -186,6 +188,7 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
* daemon.c
*/
extern const struct file_operations cachefiles_daemon_fops;
+extern void cachefiles_flush_reqs(struct cachefiles_cache *cache);
extern void cachefiles_get_unbind_pincount(struct cachefiles_cache *cache);
extern void cachefiles_put_unbind_pincount(struct cachefiles_cache *cache);
@@ -424,6 +427,8 @@ do { \
pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
fscache_io_error((___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
+ if (cachefiles_in_ondemand_mode(___cache)) \
+ cachefiles_flush_reqs(___cache); \
} while (0)
#define cachefiles_io_error_obj(object, FMT, ...) \
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 4ba42f1fa3b4..bce005f2b456 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -1,22 +1,42 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-#include <linux/fdtable.h>
#include <linux/anon_inodes.h>
#include <linux/uio.h>
#include "internal.h"
+struct ondemand_anon_file {
+ struct file *file;
+ int fd;
+};
+
+static inline void cachefiles_req_put(struct cachefiles_req *req)
+{
+ if (refcount_dec_and_test(&req->ref))
+ kfree(req);
+}
+
static int cachefiles_ondemand_fd_release(struct inode *inode,
struct file *file)
{
struct cachefiles_object *object = file->private_data;
- struct cachefiles_cache *cache = object->volume->cache;
- struct cachefiles_ondemand_info *info = object->ondemand;
- int object_id = info->ondemand_id;
+ struct cachefiles_cache *cache;
+ struct cachefiles_ondemand_info *info;
+ int object_id;
struct cachefiles_req *req;
- XA_STATE(xas, &cache->reqs, 0);
+ XA_STATE(xas, NULL, 0);
+
+ if (!object)
+ return 0;
+
+ info = object->ondemand;
+ cache = object->volume->cache;
+ xas.xa = &cache->reqs;
xa_lock(&cache->reqs);
+ spin_lock(&info->lock);
+ object_id = info->ondemand_id;
info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&info->lock);
/* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */
xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
@@ -76,12 +96,12 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
- unsigned long arg)
+ unsigned long id)
{
struct cachefiles_object *object = filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
struct cachefiles_req *req;
- unsigned long id;
+ XA_STATE(xas, &cache->reqs, id);
if (ioctl != CACHEFILES_IOC_READ_COMPLETE)
return -EINVAL;
@@ -89,10 +109,15 @@ static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
- id = arg;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_READ ||
+ req->object != object) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
trace_cachefiles_ondemand_cread(object, id);
complete(&req->done);
@@ -116,10 +141,12 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
{
struct cachefiles_req *req;
struct fscache_cookie *cookie;
+ struct cachefiles_ondemand_info *info;
char *pid, *psize;
unsigned long id;
long size;
int ret;
+ XA_STATE(xas, &cache->reqs, 0);
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return -EOPNOTSUPP;
@@ -143,10 +170,18 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
if (ret)
return ret;
- req = xa_erase(&cache->reqs, id);
- if (!req)
+ xa_lock(&cache->reqs);
+ xas.xa_index = id;
+ req = xas_load(&xas);
+ if (!req || req->msg.opcode != CACHEFILES_OP_OPEN ||
+ !req->object->ondemand->ondemand_id) {
+ xa_unlock(&cache->reqs);
return -EINVAL;
+ }
+ xas_store(&xas, NULL);
+ xa_unlock(&cache->reqs);
+ info = req->object->ondemand;
/* fail OPEN request if copen format is invalid */
ret = kstrtol(psize, 0, &size);
if (ret) {
@@ -166,6 +201,32 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
goto out;
}
+ spin_lock(&info->lock);
+ /*
+ * The anonymous fd was closed before copen ? Fail the request.
+ *
+ * t1 | t2
+ * ---------------------------------------------------------
+ * cachefiles_ondemand_copen
+ * req = xa_erase(&cache->reqs, id)
+ * // Anon fd is maliciously closed.
+ * cachefiles_ondemand_fd_release
+ * xa_lock(&cache->reqs)
+ * cachefiles_ondemand_set_object_close(object)
+ * xa_unlock(&cache->reqs)
+ * cachefiles_ondemand_set_object_open
+ * // No one will ever close it again.
+ * cachefiles_ondemand_daemon_read
+ * cachefiles_ondemand_select_req
+ *
+ * Get a read req but its fd is already closed. The daemon can't
+ * issue a cread ioctl with an closed fd, then hung.
+ */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED) {
+ spin_unlock(&info->lock);
+ req->error = -EBADFD;
+ goto out;
+ }
cookie = req->object->cookie;
cookie->object_size = size;
if (size)
@@ -175,9 +236,15 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
trace_cachefiles_ondemand_copen(req->object, id, size);
cachefiles_ondemand_set_object_open(req->object);
+ spin_unlock(&info->lock);
wake_up_all(&cache->daemon_pollwq);
out:
+ spin_lock(&info->lock);
+ /* Need to set object close to avoid reopen status continuing */
+ if (info->ondemand_id == CACHEFILES_ONDEMAND_ID_CLOSED)
+ cachefiles_ondemand_set_object_close(req->object);
+ spin_unlock(&info->lock);
complete(&req->done);
return ret;
}
@@ -205,14 +272,14 @@ int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
return 0;
}
-static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
+static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
+ struct ondemand_anon_file *anon_file)
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
struct cachefiles_open *load;
- struct file *file;
u32 object_id;
- int ret, fd;
+ int ret;
object = cachefiles_grab_object(req->object,
cachefiles_obj_get_ondemand_fd);
@@ -224,35 +291,53 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
if (ret < 0)
goto err;
- fd = get_unused_fd_flags(O_WRONLY);
- if (fd < 0) {
- ret = fd;
+ anon_file->fd = get_unused_fd_flags(O_WRONLY);
+ if (anon_file->fd < 0) {
+ ret = anon_file->fd;
goto err_free_id;
}
- file = anon_inode_getfile("[cachefiles]", &cachefiles_ondemand_fd_fops,
- object, O_WRONLY);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
+ anon_file->file = anon_inode_getfile("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ if (IS_ERR(anon_file->file)) {
+ ret = PTR_ERR(anon_file->file);
goto err_put_fd;
}
- file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
- fd_install(fd, file);
+ spin_lock(&object->ondemand->lock);
+ if (object->ondemand->ondemand_id > 0) {
+ spin_unlock(&object->ondemand->lock);
+ /* Pair with check in cachefiles_ondemand_fd_release(). */
+ anon_file->file->private_data = NULL;
+ ret = -EEXIST;
+ goto err_put_file;
+ }
+
+ anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
load = (void *)req->msg.data;
- load->fd = fd;
+ load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
+ spin_unlock(&object->ondemand->lock);
cachefiles_get_unbind_pincount(cache);
trace_cachefiles_ondemand_open(object, &req->msg, load);
return 0;
+err_put_file:
+ fput(anon_file->file);
+ anon_file->file = NULL;
err_put_fd:
- put_unused_fd(fd);
+ put_unused_fd(anon_file->fd);
+ anon_file->fd = ret;
err_free_id:
xa_erase(&cache->ondemand_ids, object_id);
err:
+ spin_lock(&object->ondemand->lock);
+ /* Avoid marking an opened object as closed. */
+ if (object->ondemand->ondemand_id <= 0)
+ cachefiles_ondemand_set_object_close(object);
+ spin_unlock(&object->ondemand->lock);
cachefiles_put_object(object, cachefiles_obj_put_ondemand_fd);
return ret;
}
@@ -294,14 +379,28 @@ static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xa
return NULL;
}
+static inline bool cachefiles_ondemand_finish_req(struct cachefiles_req *req,
+ struct xa_state *xas, int err)
+{
+ if (unlikely(!xas || !req))
+ return false;
+
+ if (xa_cmpxchg(xas->xa, xas->xa_index, req, NULL, 0) != req)
+ return false;
+
+ req->error = err;
+ complete(&req->done);
+ return true;
+}
+
ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
{
struct cachefiles_req *req;
struct cachefiles_msg *msg;
- unsigned long id = 0;
size_t n;
int ret = 0;
+ struct ondemand_anon_file anon_file;
XA_STATE(xas, &cache->reqs, cache->req_id_next);
xa_lock(&cache->reqs);
@@ -330,42 +429,37 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
xas_clear_mark(&xas, CACHEFILES_REQ_NEW);
cache->req_id_next = xas.xa_index + 1;
+ refcount_inc(&req->ref);
+ cachefiles_grab_object(req->object, cachefiles_obj_get_read_req);
xa_unlock(&cache->reqs);
- id = xas.xa_index;
-
if (msg->opcode == CACHEFILES_OP_OPEN) {
- ret = cachefiles_ondemand_get_fd(req);
- if (ret) {
- cachefiles_ondemand_set_object_close(req->object);
- goto error;
- }
+ ret = cachefiles_ondemand_get_fd(req, &anon_file);
+ if (ret)
+ goto out;
}
- msg->msg_id = id;
+ msg->msg_id = xas.xa_index;
msg->object_id = req->object->ondemand->ondemand_id;
- if (copy_to_user(_buffer, msg, n) != 0) {
+ if (copy_to_user(_buffer, msg, n) != 0)
ret = -EFAULT;
- goto err_put_fd;
- }
- /* CLOSE request has no reply */
- if (msg->opcode == CACHEFILES_OP_CLOSE) {
- xa_erase(&cache->reqs, id);
- complete(&req->done);
+ if (msg->opcode == CACHEFILES_OP_OPEN) {
+ if (ret < 0) {
+ fput(anon_file.file);
+ put_unused_fd(anon_file.fd);
+ goto out;
+ }
+ fd_install(anon_file.fd, anon_file.file);
}
-
- return n;
-
-err_put_fd:
- if (msg->opcode == CACHEFILES_OP_OPEN)
- close_fd(((struct cachefiles_open *)msg->data)->fd);
-error:
- xa_erase(&cache->reqs, id);
- req->error = ret;
- complete(&req->done);
- return ret;
+out:
+ cachefiles_put_object(req->object, cachefiles_obj_put_read_req);
+ /* Remove error request and CLOSE request has no reply */
+ if (ret || msg->opcode == CACHEFILES_OP_CLOSE)
+ cachefiles_ondemand_finish_req(req, &xas, ret);
+ cachefiles_req_put(req);
+ return ret ? ret : n;
}
typedef int (*init_req_fn)(struct cachefiles_req *req, void *private);
@@ -395,6 +489,7 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
}
+ refcount_set(&req->ref, 1);
req->object = object;
init_completion(&req->done);
req->msg.opcode = opcode;
@@ -454,9 +549,19 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
goto out;
wake_up_all(&cache->daemon_pollwq);
- wait_for_completion(&req->done);
- ret = req->error;
- kfree(req);
+wait:
+ ret = wait_for_completion_killable(&req->done);
+ if (!ret) {
+ ret = req->error;
+ } else {
+ ret = -EINTR;
+ if (!cachefiles_ondemand_finish_req(req, &xas, ret)) {
+ /* Someone will complete it soon. */
+ cpu_relax();
+ goto wait;
+ }
+ }
+ cachefiles_req_put(req);
return ret;
out:
/* Reset the object to close state in error handling path.
@@ -578,6 +683,7 @@ int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
return -ENOMEM;
object->ondemand->object = object;
+ spin_lock_init(&object->ondemand->lock);
INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker);
return 0;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index dc51df0b118d..8fd928899a59 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -107,8 +107,16 @@ static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param
int opt;
opt = fs_parse(fc, debugfs_param_specs, param, &result);
- if (opt < 0)
+ if (opt < 0) {
+ /*
+ * We might like to report bad mount options here; but
+ * traditionally debugfs has ignored all mount options
+ */
+ if (opt == -ENOPARAM)
+ return 0;
+
return opt;
+ }
switch (opt) {
case Opt_uid:
diff --git a/fs/file.c b/fs/file.c
index 8076aef9c210..a3b72aa64f11 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -486,12 +486,12 @@ struct files_struct init_files = {
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned int maxfd = fdt->max_fds;
+ unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
- if (bitbit > maxfd)
+ if (bitbit >= maxfd)
return maxfd;
if (bitbit > start)
start = bitbit;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index c5802a459334..d46558990279 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -241,6 +241,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned block_size = (1 << block_bits);
size_t poff = offset_in_folio(folio, *pos);
size_t plen = min_t(loff_t, folio_size(folio) - poff, length);
+ size_t orig_plen = plen;
unsigned first = poff >> block_bits;
unsigned last = (poff + plen - 1) >> block_bits;
@@ -277,7 +278,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* handle both halves separately so that we properly zero data in the
* page cache for blocks that are entirely outside of i_size.
*/
- if (orig_pos <= isize && orig_pos + length > isize) {
+ if (orig_pos <= isize && orig_pos + orig_plen > isize) {
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
@@ -877,22 +878,37 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t old_size = iter->inode->i_size;
+ size_t written;
if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
- return true;
+ written = copied;
+ } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
+ written = block_write_end(NULL, iter->inode->i_mapping, pos,
+ len, copied, &folio->page, NULL);
+ WARN_ON_ONCE(written != copied && written != 0);
+ } else {
+ written = __iomap_write_end(iter->inode, pos, len, copied,
+ folio) ? copied : 0;
}
- if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
- size_t bh_written;
-
- bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, &folio->page, NULL);
- WARN_ON_ONCE(bh_written != copied && bh_written != 0);
- return bh_written == copied;
+ /*
+ * Update the in-memory inode size after copying the data into the page
+ * cache. It's up to the file system to write the updated size to disk,
+ * preferably after I/O completion so that no stale data is exposed.
+ * Only once that's done can we unlock and release the folio.
+ */
+ if (pos + written > old_size) {
+ i_size_write(iter->inode, pos + written);
+ iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
+ __iomap_put_folio(iter, pos, written, folio);
- return __iomap_write_end(iter->inode, pos, len, copied, folio);
+ if (old_size < pos)
+ pagecache_isize_extended(iter->inode, old_size, pos);
+
+ return written == copied;
}
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
@@ -907,7 +923,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
do {
struct folio *folio;
- loff_t old_size;
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
@@ -959,23 +974,6 @@ retry:
written = iomap_write_end(iter, pos, bytes, copied, folio) ?
copied : 0;
- /*
- * Update the in-memory inode size after copying the data into
- * the page cache. It's up to the file system to write the
- * updated size to disk, preferably after I/O completion so that
- * no stale data is exposed. Only once that's done can we
- * unlock and release the folio.
- */
- old_size = iter->inode->i_size;
- if (pos + written > old_size) {
- i_size_write(iter->inode, pos + written);
- iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
- }
- __iomap_put_folio(iter, pos, written, folio);
-
- if (old_size < pos)
- pagecache_isize_extended(iter->inode, old_size, pos);
-
cond_resched();
if (unlikely(written == 0)) {
/*
@@ -1346,7 +1344,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
bytes = folio_size(folio) - offset;
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
@@ -1412,7 +1409,6 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
folio_mark_accessed(folio);
ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 342930996226..07a7be27182e 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1627,7 +1627,16 @@ nfs_lookup_revalidate_done(struct inode *dir, struct dentry *dentry,
switch (error) {
case 1:
break;
- case 0:
+ case -ETIMEDOUT:
+ if (inode && (IS_ROOT(dentry) ||
+ NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL))
+ error = 1;
+ break;
+ case -ESTALE:
+ case -ENOENT:
+ error = 0;
+ fallthrough;
+ default:
/*
* We can't d_drop the root of a disconnected tree:
* its d_hash is on the s_anon list and d_drop() would hide
@@ -1682,18 +1691,8 @@ static int nfs_lookup_revalidate_dentry(struct inode *dir,
dir_verifier = nfs_save_change_attribute(dir);
ret = NFS_PROTO(dir)->lookup(dir, dentry, fhandle, fattr);
- if (ret < 0) {
- switch (ret) {
- case -ESTALE:
- case -ENOENT:
- ret = 0;
- break;
- case -ETIMEDOUT:
- if (NFS_SERVER(inode)->flags & NFS_MOUNT_SOFTREVAL)
- ret = 1;
- }
+ if (ret < 0)
goto out;
- }
/* Request help from readdirplus */
nfs_lookup_advise_force_readdirplus(dir, flags);
@@ -1737,7 +1736,7 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct inode *inode;
- int error;
+ int error = 0;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = d_inode(dentry);
@@ -1782,7 +1781,7 @@ out_valid:
out_bad:
if (flags & LOOKUP_RCU)
return -ECHILD;
- return nfs_lookup_revalidate_done(dir, dentry, inode, 0);
+ return nfs_lookup_revalidate_done(dir, dentry, inode, error);
}
static int
@@ -1804,9 +1803,10 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags,
if (parent != READ_ONCE(dentry->d_parent))
return -ECHILD;
} else {
- /* Wait for unlink to complete */
+ /* Wait for unlink to complete - see unblock_revalidate() */
wait_var_event(&dentry->d_fsdata,
- dentry->d_fsdata != NFS_FSDATA_BLOCKED);
+ smp_load_acquire(&dentry->d_fsdata)
+ != NFS_FSDATA_BLOCKED);
parent = dget_parent(dentry);
ret = reval(d_inode(parent), dentry, flags);
dput(parent);
@@ -1819,6 +1819,29 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
return __nfs_lookup_revalidate(dentry, flags, nfs_do_lookup_revalidate);
}
+static void block_revalidate(struct dentry *dentry)
+{
+ /* old devname - just in case */
+ kfree(dentry->d_fsdata);
+
+ /* Any new reference that could lead to an open
+ * will take ->d_lock in lookup_open() -> d_lookup().
+ * Holding this lock ensures we cannot race with
+ * __nfs_lookup_revalidate() and removes and need
+ * for further barriers.
+ */
+ lockdep_assert_held(&dentry->d_lock);
+
+ dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+}
+
+static void unblock_revalidate(struct dentry *dentry)
+{
+ /* store_release ensures wait_var_event() sees the update */
+ smp_store_release(&dentry->d_fsdata, NULL);
+ wake_up_var(&dentry->d_fsdata);
+}
+
/*
* A weaker form of d_revalidate for revalidating just the d_inode(dentry)
* when we don't really care about the dentry name. This is called when a
@@ -2255,6 +2278,9 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
*/
int error = 0;
+ if (dentry->d_name.len > NFS_SERVER(dir)->namelen)
+ return -ENAMETOOLONG;
+
if (open_flags & O_CREAT) {
file->f_mode |= FMODE_CREATED;
error = nfs_do_create(dir, dentry, mode, open_flags);
@@ -2549,15 +2575,12 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
goto out;
}
- /* old devname */
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(dentry);
spin_unlock(&dentry->d_lock);
error = nfs_safe_remove(dentry);
nfs_dentry_remove_handle_error(dir, dentry, error);
- dentry->d_fsdata = NULL;
- wake_up_var(&dentry->d_fsdata);
+ unblock_revalidate(dentry);
out:
trace_nfs_unlink_exit(dir, dentry, error);
return error;
@@ -2664,8 +2687,7 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
{
struct dentry *new_dentry = data->new_dentry;
- new_dentry->d_fsdata = NULL;
- wake_up_var(&new_dentry->d_fsdata);
+ unblock_revalidate(new_dentry);
}
/*
@@ -2727,11 +2749,6 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) ||
WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED))
goto out;
- if (new_dentry->d_fsdata) {
- /* old devname */
- kfree(new_dentry->d_fsdata);
- new_dentry->d_fsdata = NULL;
- }
spin_lock(&new_dentry->d_lock);
if (d_count(new_dentry) > 2) {
@@ -2753,7 +2770,7 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_dentry = dentry;
new_inode = NULL;
} else {
- new_dentry->d_fsdata = NFS_FSDATA_BLOCKED;
+ block_revalidate(new_dentry);
must_unblock = true;
spin_unlock(&new_dentry->d_lock);
}
@@ -2765,6 +2782,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
if (IS_ERR(task)) {
+ if (must_unblock)
+ unblock_revalidate(new_dentry);
error = PTR_ERR(task);
goto out;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c93c12063b3a..a691fa10b3e9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4023,6 +4023,23 @@ static void test_fs_location_for_trunking(struct nfs4_fs_location *location,
}
}
+static bool _is_same_nfs4_pathname(struct nfs4_pathname *path1,
+ struct nfs4_pathname *path2)
+{
+ int i;
+
+ if (path1->ncomponents != path2->ncomponents)
+ return false;
+ for (i = 0; i < path1->ncomponents; i++) {
+ if (path1->components[i].len != path2->components[i].len)
+ return false;
+ if (memcmp(path1->components[i].data, path2->components[i].data,
+ path1->components[i].len))
+ return false;
+ }
+ return true;
+}
+
static int _nfs4_discover_trunking(struct nfs_server *server,
struct nfs_fh *fhandle)
{
@@ -4056,9 +4073,13 @@ static int _nfs4_discover_trunking(struct nfs_server *server,
if (status)
goto out_free_3;
- for (i = 0; i < locations->nlocations; i++)
+ for (i = 0; i < locations->nlocations; i++) {
+ if (!_is_same_nfs4_pathname(&locations->fs_path,
+ &locations->locations[i].rootpath))
+ continue;
test_fs_location_for_trunking(&locations->locations[i], clp,
server);
+ }
out_free_3:
kfree(locations->fattr);
out_free_2:
@@ -6268,6 +6289,7 @@ nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
if (status == 0)
nfs_setsecurity(inode, fattr);
+ nfs_free_fattr(fattr);
return status;
}
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 6efb5068c116..040b6b79c75e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -1545,6 +1545,11 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
continue;
} else if (index == prev->wb_index + 1)
continue;
+ /*
+ * We will submit more requests after these. Indicate
+ * this to the underlying layers.
+ */
+ desc->pg_moreio = 1;
nfs_pageio_complete(desc);
break;
}
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 0e27a2e4e68b..13818129d268 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -41,7 +41,7 @@ static int nfs_symlink_filler(struct file *file, struct folio *folio)
error:
folio_set_error(folio);
folio_unlock(folio);
- return -EIO;
+ return error;
}
static const char *nfs_get_link(struct dentry *dentry,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index a002a44ff161..52e50b1b7f22 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -607,7 +607,7 @@ int nilfs_empty_dir(struct inode *inode)
kaddr = nilfs_get_folio(inode, i, &folio);
if (IS_ERR(kaddr))
- continue;
+ return 0;
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 60d4f59f7665..6ea81f1d5094 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh->b_folio != bd_folio) {
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
if (bh == segbuf->sb_super_root) {
if (bh->b_folio != bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
@@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
if (bd_folio) {
folio_lock(bd_folio);
+ folio_wait_writeback(bd_folio);
folio_clear_dirty_for_io(bd_folio);
folio_start_writeback(bd_folio);
folio_unlock(bd_folio);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 18550c071d71..72a1acd03675 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
- seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
+ seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm));
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 993ac36c3d58..38a06e8a0f90 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -4577,8 +4577,6 @@ smb2_readv_callback(struct mid_q_entry *mid)
if (rdata->subreq.start < rdata->subreq.rreq->i_size)
rdata->result = 0;
}
- if (rdata->result == 0 || rdata->result == -EAGAIN)
- iov_iter_advance(&rdata->subreq.io_iter, rdata->got_bytes);
rdata->credits.value = 0;
netfs_subreq_terminated(&rdata->subreq,
(rdata->result == 0 || rdata->result == -EAGAIN) ?
@@ -4789,7 +4787,6 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->result = -ENOSPC;
else
wdata->subreq.len = written;
- iov_iter_advance(&wdata->subreq.io_iter, written);
break;
case MID_REQUEST_SUBMITTED:
case MID_RETRY_NEEDED:
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 02135a605305..1476c445cadc 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid)
}
tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid);
if (!tcon) {
- cifs_put_smb_ses(ses);
spin_unlock(&cifs_tcp_ses_lock);
+ cifs_put_smb_ses(ses);
return NULL;
}
spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index b6c5a8ea3887..e7e07891781b 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -630,6 +630,12 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
return name;
}
+ if (*name == '\\') {
+ pr_err("not allow directory name included leading slash\n");
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+
ksmbd_conv_path_to_unix(name);
ksmbd_strip_last_slash(name);
return name;
@@ -2361,7 +2367,8 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
if (rc > 0) {
rc = ksmbd_vfs_remove_xattr(idmap,
path,
- attr_name);
+ attr_name,
+ get_write);
if (rc < 0) {
ksmbd_debug(SMB,
@@ -2376,7 +2383,7 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
} else {
rc = ksmbd_vfs_setxattr(idmap, path, attr_name, value,
le16_to_cpu(eabuf->EaValueLength),
- 0, true);
+ 0, get_write);
if (rc < 0) {
ksmbd_debug(SMB,
"ksmbd_vfs_setxattr is failed(%d)\n",
@@ -2468,7 +2475,7 @@ static int smb2_remove_smb_xattrs(const struct path *path)
!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX,
STREAM_PREFIX_LEN)) {
err = ksmbd_vfs_remove_xattr(idmap, path,
- name);
+ name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n",
name);
@@ -2842,20 +2849,11 @@ int smb2_open(struct ksmbd_work *work)
}
if (req->NameLength) {
- if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) &&
- *(char *)req->Buffer == '\\') {
- pr_err("not allow directory name included leading slash\n");
- rc = -EINVAL;
- goto err_out2;
- }
-
name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
work->conn->local_nls);
if (IS_ERR(name)) {
rc = PTR_ERR(name);
- if (rc != -ENOMEM)
- rc = -ENOENT;
name = NULL;
goto err_out2;
}
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 51b1b0bed616..9e859ba010cf 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -1058,16 +1058,21 @@ int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length,
}
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name)
+ const struct path *path, char *attr_name,
+ bool get_write)
{
int err;
- err = mnt_want_write(path->mnt);
- if (err)
- return err;
+ if (get_write == true) {
+ err = mnt_want_write(path->mnt);
+ if (err)
+ return err;
+ }
err = vfs_removexattr(idmap, path->dentry, attr_name);
- mnt_drop_write(path->mnt);
+
+ if (get_write == true)
+ mnt_drop_write(path->mnt);
return err;
}
@@ -1380,7 +1385,7 @@ int ksmbd_vfs_remove_sd_xattrs(struct mnt_idmap *idmap, const struct path *path)
ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name));
if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) {
- err = ksmbd_vfs_remove_xattr(idmap, path, name);
+ err = ksmbd_vfs_remove_xattr(idmap, path, name, true);
if (err)
ksmbd_debug(SMB, "remove xattr failed : %s\n", name);
}
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index cfe1c8092f23..cb76f4b5bafe 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -114,7 +114,8 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
- const struct path *path, char *attr_name);
+ const struct path *path, char *attr_name,
+ bool get_write);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
unsigned int flags, struct path *parent_path,
struct path *path, bool caseless);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 6cb599cd287e..8b2e37c8716e 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -254,7 +254,8 @@ static void __ksmbd_inode_close(struct ksmbd_file *fp)
ci->m_flags &= ~S_DEL_ON_CLS_STREAM;
err = ksmbd_vfs_remove_xattr(file_mnt_idmap(filp),
&filp->f_path,
- fp->stream.name);
+ fp->stream.name,
+ true);
if (err)
pr_err("remove xattr failed : %s\n",
fp->stream.name);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 09e4bf949bf8..6b56f0f6d4c1 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1038,11 +1038,12 @@ xfs_log_sb(
* and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
mp->m_sb.sb_ifree = min_t(uint64_t,
- percpu_counter_sum(&mp->m_ifree),
+ percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks =
+ percpu_counter_sum_positive(&mp->m_fdblocks);
}
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);