Merge tag 'bcachefs-2023-11-17' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Lots of small fixes for minor nits and compiler warnings.

  Bigger items:

   - The six locks lost-wakeup bug is finally fixed: six_read_trylock()
     was checking for the waiting bit before decrementing the number of
     readers; the fix was validated with a torture test (see the sketch
     after the six lock hunk below).

   - Fix for a memory reclaim issue: when needing to reallocate a key
     cache key, we now do our usual GFP_NOWAIT; unlock(); GFP_KERNEL
     dance (a sketch of the pattern follows the transaction commit
     hunks below).

   - Multiple fixes for the deleted inodes btree

   - Fix an issue in fsck where i_nlink would be recalculated
     incorrectly for hardlinked files if a snapshot had ever been taken
     (see the comparator sketch after the fsck hunk below).

   - Kill journal pre-reservations: This is a bigger patch than I would
     normally send at this point, but it deletes code and it fixes some
     of our tests that would sporadically die with the journal getting
     stuck, and it's a performance improvement, too"

* tag 'bcachefs-2023-11-17' of https://evilpiepirate.org/git/bcachefs: (22 commits)
  bcachefs: Fix missing locking for dentry->d_parent access
  bcachefs: six locks: Fix lost wakeup
  bcachefs: Fix no_data_io mode checksum check
  bcachefs: Fix bch2_check_nlinks() for snapshots
  bcachefs: Don't decrease BTREE_ITER_MAX when LOCKDEP=y
  bcachefs: Disable debug log statements
  bcachefs: Fix missing transaction commit
  bcachefs: Fix error path in bch2_mount()
  bcachefs: Fix potential sleeping during mount
  bcachefs: Fix iterator leak in may_delete_deleted_inode()
  bcachefs: Kill journal pre-reservations
  bcachefs: Check for nonce offset inconsistency in data_update path
  bcachefs: Make sure to drop/retake btree locks before reclaim
  bcachefs: btree_trans->write_locked
  bcachefs: Run btree key cache shrinker less aggressively
  bcachefs: Split out btree_key_cache_types.h
  bcachefs: Guard against insufficient devices to create stripes
  bcachefs: Fix null ptr deref in bch2_backpointer_get_node()
  bcachefs: Fix multiple -Warray-bounds warnings
  bcachefs: Use DECLARE_FLEX_ARRAY() helper and fix multiple -Warray-bounds warnings
  ...
Linus Torvalds 2023-11-17 14:36:58 -08:00
commit 791c8ab095
27 changed files with 248 additions and 377 deletions


@@ -313,17 +313,17 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
bp.level - 1,
0);
b = bch2_btree_iter_peek_node(iter);
if (IS_ERR(b))
if (IS_ERR_OR_NULL(b))
goto err;
BUG_ON(b->c.level != bp.level - 1);
if (b && extent_matches_bp(c, bp.btree_id, bp.level,
bkey_i_to_s_c(&b->key),
bucket, bp))
if (extent_matches_bp(c, bp.btree_id, bp.level,
bkey_i_to_s_c(&b->key),
bucket, bp))
return b;
if (b && btree_node_will_make_reachable(b)) {
if (btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));


@@ -617,7 +617,7 @@ struct journal_seq_blacklist_table {
u64 start;
u64 end;
bool dirty;
} entries[0];
} entries[];
};
struct journal_keys {


@@ -3087,8 +3087,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
kfree(trans->extra_journal_entries.data);
if (trans->fs_usage_deltas) {


@@ -89,10 +89,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
ck->btree_trans_barrier_seq =
start_poll_synchronize_srcu(&c->btree_trans_barrier);
if (ck->c.lock.readers)
if (ck->c.lock.readers) {
list_move_tail(&ck->list, &bc->freed_pcpu);
else
bc->nr_freed_pcpu++;
} else {
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
}
atomic_long_inc(&bc->nr_freed);
kfree(ck->k);
@@ -109,6 +112,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
{
struct bkey_cached *pos;
bc->nr_freed_nonpcpu++;
list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
pos->btree_trans_barrier_seq)) {
@@ -158,6 +163,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
#else
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_nonpcpu);
bc->nr_freed_nonpcpu++;
mutex_unlock(&bc->lock);
#endif
} else {
@@ -217,6 +223,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
f->nr < ARRAY_SIZE(f->objs) / 2) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
f->objs[f->nr++] = ck;
}
@@ -229,6 +236,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_nonpcpu)) {
ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_nonpcpu--;
}
mutex_unlock(&bc->lock);
#endif
@@ -664,7 +672,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
goto out;
bch2_journal_pin_drop(j, &ck->journal);
bch2_journal_preres_put(j, &ck->res);
BUG_ON(!btree_node_locked(c_iter.path, 0));
@@ -762,18 +769,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
BUG_ON(insert->k.u64s > ck->u64s);
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
int difference;
BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
if (difference > 0) {
trans->journal_preres.u64s -= difference;
ck->res.u64s += difference;
}
}
bkey_copy(ck->k, insert);
ck->valid = true;
@@ -850,6 +845,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
* Newest freed entries are at the end of the list - once we hit one
* that's too new to be freed, we can bail out:
*/
scanned += bc->nr_freed_nonpcpu;
list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -859,13 +856,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
bc->nr_freed_nonpcpu--;
}
if (scanned >= nr)
goto out;
scanned += bc->nr_freed_pcpu;
list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
ck->btree_trans_barrier_seq))
@@ -875,8 +874,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
six_lock_exit(&ck->c.lock);
kmem_cache_free(bch2_key_cache, ck);
atomic_long_dec(&bc->nr_freed);
scanned++;
freed++;
bc->nr_freed_pcpu--;
}
if (scanned >= nr)
@@ -982,6 +981,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
}
#endif
BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
list_splice(&bc->freed_pcpu, &items);
list_splice(&bc->freed_nonpcpu, &items);
@@ -991,7 +993,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
cond_resched();
bch2_journal_pin_drop(&c->journal, &ck->journal);
bch2_journal_preres_put(&c->journal, &ck->res);
list_del(&ck->list);
kfree(ck->k);


@@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
size_t nr_freed_pcpu;
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed __aligned(4);
#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */


@@ -78,6 +78,53 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
bch2_btree_init_next(trans, b);
}
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
EBUG_ON(trans->write_locked);
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
trans->write_locked = true;
return 0;
}
static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(trans, i->path,
insert_l(i)->b);
trans->write_locked = false;
}
}
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
@@ -276,17 +323,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
}
static noinline int
bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
return drop_locks_do(trans,
bch2_journal_preres_get(&trans->c->journal,
&trans->journal_preres,
trans->journal_preres_u64s,
(flags & BCH_WATERMARK_MASK)));
}
static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
unsigned flags)
{
@@ -321,6 +357,45 @@ static inline int btree_key_can_insert(struct btree_trans *trans,
return 0;
}
noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k;
int ret;
bch2_trans_unlock_write(trans);
bch2_trans_unlock(trans);
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
ret = bch2_trans_relock(trans) ?:
bch2_trans_lock_write(trans);
if (unlikely(ret)) {
kfree(new_k);
return ret;
}
memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
i->old_v = &new_k->v;
kfree(ck->k);
ck->u64s = new_u64s;
ck->k = new_k;
return 0;
}
static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned u64s)
{
@@ -347,12 +422,9 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
trans_for_each_update(trans, i)
if (i->old_v == &ck->k->v)
@@ -732,37 +804,6 @@ revert_fs_usage:
return ret;
}
static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
{
while (--i >= trans->updates) {
if (same_leaf_as_prev(trans, i))
continue;
bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
}
static inline int trans_lock_write(struct btree_trans *trans)
{
struct btree_insert_entry *i;
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
}
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -830,15 +871,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
}
}
ret = bch2_journal_preres_get(&c->journal,
&trans->journal_preres, trans->journal_preres_u64s,
(flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
if (unlikely(ret))
return ret;
ret = trans_lock_write(trans);
ret = bch2_trans_lock_write(trans);
if (unlikely(ret))
return ret;
@@ -847,10 +880,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write_inlined(trans, i->path,
insert_l(i)->b);
bch2_trans_unlock_write(trans);
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -1003,7 +1033,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
@@ -1063,13 +1092,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
trans->journal_u64s = trans->extra_journal_entries.nr;
trans->journal_preres_u64s = 0;
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1085,16 +1109,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (i->key_cache_already_flushed)
continue;
/* we're going to journal the key being updated: */
u64s = jset_u64s(i->k->k.u64s);
if (i->cached &&
likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
if (i->flags & BTREE_UPDATE_NOJOURNAL)
continue;
trans->journal_u64s += u64s;
/* we're going to journal the key being updated: */
trans->journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
@@ -1126,8 +1145,6 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
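
A note on the btree_key_can_insert_cached() hunks above - this is the
"GFP_NOWAIT; unlock(); GFP_KERNEL dance" from the pull message: the fast
path tries krealloc(..., GFP_NOWAIT), which cannot recurse into reclaim
and so is safe while btree locks are held, and only on failure does the
slowpath drop the locks, allocate with GFP_KERNEL, and relock. A minimal
standalone sketch of the pattern (the struct and locking helpers here
are hypothetical, not the bcachefs API):

#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical container, standing in for a key cache entry: */
struct buf_ctx {
	struct mutex	lock;	/* held by the caller on entry */
	void		*buf;
	size_t		size;
};

static int buf_resize(struct buf_ctx *c, size_t new_size)
{
	/* Fast path: atomic allocation, safe under the lock: */
	void *new_buf = krealloc(c->buf, new_size, GFP_NOWAIT);

	if (!new_buf) {
		/* Slow path: drop the lock before a sleeping allocation: */
		mutex_unlock(&c->lock);
		new_buf = kmalloc(new_size, GFP_KERNEL);
		mutex_lock(&c->lock);
		if (!new_buf)
			return -ENOMEM;

		memcpy(new_buf, c->buf, min(c->size, new_size));
		kfree(c->buf);
	}

	c->buf = new_buf;
	c->size = new_size;
	return 0;
}

The real slowpath has obligations this sketch omits: after unlocking,
bch2_trans_relock() can fail and restart the transaction, and pointers
into the old key (i->old_v in the hunk above) must be re-pointed at the
new allocation.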


@@ -5,7 +5,7 @@
#include <linux/list.h>
#include <linux/rhashtable.h>
//#include "bkey_methods.h"
#include "btree_key_cache_types.h"
#include "buckets_types.h"
#include "darray.h"
#include "errcode.h"
@@ -312,31 +312,6 @@ struct btree_iter {
#endif
};
struct btree_key_cache_freelist {
struct bkey_cached *objs[16];
unsigned nr;
};
struct btree_key_cache {
struct mutex lock;
struct rhashtable table;
bool table_init_done;
struct list_head freed_pcpu;
struct list_head freed_nonpcpu;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
atomic_long_t nr_freed;
atomic_long_t nr_keys;
atomic_long_t nr_dirty;
};
struct bkey_cached_key {
u32 btree_id;
struct bpos pos;
} __packed __aligned(4);
#define BKEY_CACHED_ACCESSED 0
#define BKEY_CACHED_DIRTY 1
@@ -352,7 +327,6 @@ struct bkey_cached {
struct rhash_head hash;
struct list_head list;
struct journal_preres res;
struct journal_entry_pin journal;
u64 seq;
@@ -389,11 +363,7 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
#ifndef CONFIG_LOCKDEP
#define BTREE_ITER_MAX 64
#else
#define BTREE_ITER_MAX 32
#endif
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -434,6 +404,7 @@ struct btree_trans {
bool journal_transaction_names:1;
bool journal_replay_not_finished:1;
bool notrace_relock_fail:1;
bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
unsigned long last_begin_ip;
@@ -465,11 +436,9 @@ struct btree_trans {
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
struct journal_preres journal_preres;
u64 *journal_seq;
struct disk_reservation *disk_res;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
};


@@ -513,8 +513,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
up_read(&c->gc_lock);
as->took_gc_lock = false;
bch2_journal_preres_put(&c->journal, &as->journal_preres);
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_pin_flush(&c->journal, &as->journal);
bch2_disk_reservation_put(c, &as->disk_res);
@@ -734,8 +732,6 @@ err:
bch2_journal_pin_drop(&c->journal, &as->journal);
bch2_journal_preres_put(&c->journal, &as->journal_preres);
mutex_lock(&c->btree_interior_update_lock);
for (i = 0; i < as->nr_new_nodes; i++) {
b = as->new_nodes[i];
@@ -1047,7 +1043,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
unsigned journal_flags = 0;
int ret = 0;
u32 restart_count = trans->restart_count;
@@ -1061,10 +1056,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
journal_flags |= watermark;
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
@@ -1129,27 +1120,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags|JOURNAL_RES_GET_NONBLOCK);
if (ret) {
if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
}
ret = drop_locks_do(trans,
bch2_journal_preres_get(&c->journal, &as->journal_preres,
BTREE_UPDATE_JOURNAL_RES,
journal_flags));
if (ret == -BCH_ERR_journal_preres_get_blocked) {
trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
}
if (ret)
goto err;
}
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
c->opts.metadata_replicas,


@@ -55,7 +55,6 @@ struct btree_update {
unsigned update_level;
struct disk_reservation disk_res;
struct journal_preres journal_preres;
/*
* BTREE_INTERIOR_UPDATING_NODE:


@@ -239,6 +239,34 @@ restart_drop_extra_replicas:
next_pos = insert->k.p;
/*
* Check for nonce offset inconsistency:
* This is debug code - we've been seeing this bug rarely, and
* it's been hard to reproduce, so this should give us some more
* information when it does occur:
*/
struct printbuf err = PRINTBUF;
int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
printbuf_exit(&err);
if (invalid) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "about to insert invalid key in data update path");
prt_str(&buf, "\nold: ");
bch2_bkey_val_to_text(&buf, c, old);
prt_str(&buf, "\nk: ");
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, "\nnew: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
bch2_print_string_as_lines(KERN_ERR, buf.buf);
printbuf_exit(&buf);
bch2_fatal_error(c);
goto out;
}
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,


@@ -555,6 +555,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
case TARGET_DEV: {
struct bch_dev *ca;
out->atomic++;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
@@ -570,6 +571,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
rcu_read_unlock();
out->atomic--;
break;
}
case TARGET_GROUP:
@@ -580,7 +582,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
}
}
void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);


@@ -1373,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
h->nr_active_devs++;
rcu_read_unlock();
/*
* If we only have redundancy + 1 devices, we're better off with just
* replication:
*/
if (h->nr_active_devs < h->redundancy + 2)
bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
h->nr_active_devs, h->redundancy + 2);
list_add(&h->list, &c->ec_stripe_head_list);
return h;
}
@@ -1424,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
found:
if (!IS_ERR_OR_NULL(h) &&
h->nr_active_devs < h->redundancy + 2) {
mutex_unlock(&h->lock);
h = NULL;
}
mutex_unlock(&c->ec_stripe_head_lock);
return h;
}
@@ -1681,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
int ret;
h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
if (!h)
bch_err(c, "no stripe head");
if (IS_ERR_OR_NULL(h))
return h;


@@ -13,7 +13,7 @@
int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
loff_t start, u64 end,
int fgp_flags, gfp_t gfp,
fgf_t fgp_flags, gfp_t gfp,
folios *fs)
{
struct folio *f;


@@ -7,7 +7,7 @@
typedef DARRAY(struct folio *) folios;
int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
u64, int, gfp_t, folios *);
u64, fgf_t, gfp_t, folios *);
int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
/*


@@ -1922,10 +1922,7 @@ out:
return dget(sb->s_root);
err_put_super:
sb->s_fs_info = NULL;
c->vfs_sb = NULL;
deactivate_locked_super(sb);
bch2_fs_stop(c);
return ERR_PTR(bch2_err_class(ret));
}
@@ -1933,11 +1930,8 @@ static void bch2_kill_sb(struct super_block *sb)
{
struct bch_fs *c = sb->s_fs_info;
if (c)
c->vfs_sb = NULL;
generic_shutdown_super(sb);
if (c)
bch2_fs_free(c);
bch2_fs_free(c);
}
static struct file_system_type bcache_fs_type = {


@@ -2220,7 +2220,7 @@ static int nlink_cmp(const void *_l, const void *_r)
const struct nlink *l = _l;
const struct nlink *r = _r;
return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
return cmp_int(l->inum, r->inum);
}
static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
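
The nlink_cmp() change above makes fsck's link-count table sort by
inode number alone; snapshot handling is done separately (note the
snapshots_seen argument to inc_link()). The idiom: cmp_int() returns
-1/0/1, and a ?: chain falls through to the next field only on
equality, so deleting the snapshot term makes entries for the same
inode in different snapshots collate together. A standalone
illustration, assuming the kernel's cmp_int() helper (the struct is a
simplified stand-in):

#include <linux/kernel.h>	/* cmp_int(l, r): ((l > r) - (l < r)) */
#include <linux/types.h>

struct nlink_key {
	u64	inum;
	u32	snapshot;	/* deliberately not part of the sort key */
};

static int nlink_key_cmp(const void *_l, const void *_r)
{
	const struct nlink_key *l = _l, *r = _r;

	return cmp_int(l->inum, r->inum);
}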


@@ -1134,7 +1134,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
* unlinked inodes in the snapshot leaves:
*/
*need_another_pass = true;
return 0;
goto out;
}
ret = 1;
@@ -1169,8 +1169,10 @@ again:
*/
for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p,
&need_another_pass));
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
if (ret < 0)
break;


@@ -795,7 +795,7 @@ static int bch2_write_decrypt(struct bch_write_op *op)
* checksum:
*/
csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
if (bch2_crc_cmp(op->crc.csum, csum))
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
return -EIO;
ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);


@@ -526,36 +526,6 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
return ret;
}
/* journal_preres: */
static bool journal_preres_available(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
if (!ret && mutex_trylock(&j->reclaim_lock)) {
bch2_journal_reclaim(j);
mutex_unlock(&j->reclaim_lock);
}
return ret;
}
int __bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
int ret;
closure_wait_event(&j->preres_wait,
(ret = bch2_journal_error(j)) ||
journal_preres_available(j, res, new_u64s, flags));
return ret;
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *j,
@@ -1306,7 +1276,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining);
prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);


@@ -395,104 +395,6 @@ out:
return 0;
}
/* journal_preres: */
static inline void journal_set_watermark(struct journal *j)
{
union journal_preres_state s = READ_ONCE(j->prereserved);
unsigned watermark = BCH_WATERMARK_stripe;
if (fifo_free(&j->pin) < j->pin.size / 4)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (fifo_free(&j->pin) < j->pin.size / 8)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (s.reserved > s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
if (!s.remaining)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (watermark == j->watermark)
return;
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
}
static inline void bch2_journal_preres_put(struct journal *j,
struct journal_preres *res)
{
union journal_preres_state s = { .reserved = res->u64s };
if (!res->u64s)
return;
s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
res->u64s = 0;
if (unlikely(s.waiting)) {
clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
(unsigned long *) &j->prereserved.v);
closure_wake_up(&j->preres_wait);
}
if (s.reserved <= s.remaining && j->watermark)
journal_set_watermark(j);
}
int __bch2_journal_preres_get(struct journal *,
struct journal_preres *, unsigned, unsigned);
static inline int bch2_journal_preres_get_fast(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags,
bool set_waiting)
{
int d = new_u64s - res->u64s;
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret;
do {
old.v = new.v = v;
ret = 0;
if (watermark == BCH_WATERMARK_reclaim ||
new.reserved + d < new.remaining) {
new.reserved += d;
ret = 1;
} else if (set_waiting && !new.waiting)
new.waiting = true;
else
return 0;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
if (ret)
res->u64s += d;
return ret;
}
static inline int bch2_journal_preres_get(struct journal *j,
struct journal_preres *res,
unsigned new_u64s,
unsigned flags)
{
if (new_u64s <= res->u64s)
return 0;
if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
return 0;
if (flags & JOURNAL_RES_GET_NONBLOCK)
return -BCH_ERR_journal_preres_get_blocked;
return __bch2_journal_preres_get(j, res, new_u64s, flags);
}
/* journal_entry_res: */
void bch2_journal_entry_res_resize(struct journal *,


@@ -1079,6 +1079,12 @@ found:
if (ja->bucket_seq[ja->cur_idx] &&
ja->sectors_free == ca->mi.bucket_size) {
#if 0
/*
* Debug code for ZNS support, where we (probably) want to be
* correlated where we stopped in the journal to the zone write
* points:
*/
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
@@ -1086,6 +1092,7 @@ found:
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
#endif
ja->sectors_free = 0;
}


@@ -50,16 +50,21 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
static inline void journal_set_watermark(struct journal *j, bool low_on_space)
{
union journal_preres_state old, new;
u64 v = atomic64_read(&j->prereserved.counter);
unsigned watermark = BCH_WATERMARK_stripe;
do {
old.v = new.v = v;
new.remaining = u64s_remaining;
} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
old.v, new.v)) != old.v);
if (low_on_space)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (fifo_free(&j->pin) < j->pin.size / 4)
watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
if (watermark == j->watermark)
return;
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
}
static struct journal_space
@@ -162,7 +167,6 @@ void bch2_journal_space_available(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
s64 u64s_remaining = 0;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
unsigned i, nr_online = 0, nr_devs_want;
@@ -222,16 +226,10 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
u64s_remaining = (u64) clean << 6;
u64s_remaining -= (u64) total << 3;
u64s_remaining = max(0LL, u64s_remaining);
u64s_remaining /= 4;
u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
journal_set_watermark(j, clean * 4 <= total);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
journal_set_remaining(j, u64s_remaining);
journal_set_watermark(j);
if (!ret)
journal_wake(j);
@@ -555,11 +553,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
/* And include pre-reservations: */
nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
(ca->mi.bucket_size << 6) -
journal_entry_overhead(j));
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -638,10 +631,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
msecs_to_jiffies(c->opts.journal_reclaim_delay)))
min_nr = 1;
if (j->prereserved.reserved * 4 > j->prereserved.remaining)
min_nr = 1;
if (fifo_free(&j->pin) <= 32)
if (j->watermark != BCH_WATERMARK_stripe)
min_nr = 1;
if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -652,8 +642,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
trace_and_count(c, journal_reclaim_start, c,
direct, kicked,
min_nr, min_key_cache,
j->prereserved.reserved,
j->prereserved.remaining,
atomic_read(&c->btree_cache.dirty),
c->btree_cache.used,
atomic_long_read(&c->btree_key_cache.nr_dirty),


@@ -76,14 +76,6 @@ struct journal_res {
u64 seq;
};
/*
* For reserving space in the journal prior to getting a reservation on a
* particular journal entry:
*/
struct journal_preres {
unsigned u64s;
};
union journal_res_state {
struct {
atomic64_t counter;
@@ -104,22 +96,6 @@ union journal_res_state {
};
};
union journal_preres_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 waiting:1,
reserved:31,
remaining:32;
};
};
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
union journal_res_state reservations;
enum bch_watermark watermark;
union journal_preres_state prereserved;
} __aligned(SMP_CACHE_BYTES);
unsigned long flags;


@@ -163,8 +163,11 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
this_cpu_sub(*lock->readers, !ret);
preempt_enable();
if (!ret && (old & SIX_LOCK_WAITING_write))
ret = -1 - SIX_LOCK_write;
if (!ret) {
smp_mb();
if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
ret = -1 - SIX_LOCK_write;
}
} else if (type == SIX_LOCK_write && lock->readers) {
if (try) {
atomic_add(SIX_LOCK_HELD_write, &lock->state);
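
This hunk is the lost wakeup fix. Previously the failed-trylock path
tested a copy of the lock state that had been read before the reader
count was decremented: a writer could set SIX_LOCK_WAITING_write in
that window, observe the not-yet-decremented read count, and go to
sleep, while the reader saw no waiting bit - so nobody issued the
wakeup. The fix re-reads the state after an smp_mb(), ordering the
decrement before the check (pairing with the writer's check of the read
count). A simplified sketch of the fixed ordering - hypothetical state
bits, not the real six lock code:

#include <linux/atomic.h>

#define WAITING_WRITE	(1U << 0)	/* hypothetical waiting-writer bit */

static atomic_t state;
static atomic_t readers;	/* stands in for the per-cpu read count */

/* Failed read trylock: return true if a waiting writer must be woken. */
static bool read_trylock_failed(void)
{
	atomic_dec(&readers);	/* undo the optimistic read acquisition */

	/*
	 * Full barrier: our decrement must be visible before we re-read
	 * the state.  Checking a value of 'state' loaded before the
	 * decrement is exactly the bug the hunk above fixes.
	 */
	smp_mb();
	return atomic_read(&state) & WAITING_WRITE;
}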


@@ -20,7 +20,7 @@ struct snapshot_t {
};
struct snapshot_table {
struct snapshot_t s[0];
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};
typedef struct {

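Like the journal_seq_blacklist_table entries[] fix earlier, this
replaces a zero-length array - which -Warray-bounds now flags - with a
real flexible array member. DECLARE_FLEX_ARRAY() exists because C
forbids a flexible array member as the sole member of a struct; the
macro wraps it in an anonymous union to make that legal. A sketch of
the declaration plus struct_size()-based allocation (the snapshot_t
fields are simplified placeholders):

#include <linux/overflow.h>	/* struct_size() */
#include <linux/slab.h>
#include <linux/stddef.h>	/* DECLARE_FLEX_ARRAY() */
#include <linux/types.h>

struct snapshot_t {
	u32	parent;		/* placeholder fields */
	u32	depth;
};

struct snapshot_table {
	DECLARE_FLEX_ARRAY(struct snapshot_t, s);
};

static struct snapshot_table *snapshot_table_alloc(size_t nr)
{
	struct snapshot_table *t;

	/* sizeof(*t) + nr * sizeof(t->s[0]), with overflow checking: */
	t = kvzalloc(struct_size(t, s, nr), GFP_KERNEL);
	return t;
}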

@@ -196,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
TRACE_EVENT(journal_reclaim_start,
TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
u64 min_nr, u64 min_key_cache,
u64 prereserved, u64 prereserved_total,
u64 btree_cache_dirty, u64 btree_cache_total,
u64 btree_key_cache_dirty, u64 btree_key_cache_total),
TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
btree_cache_dirty, btree_cache_total,
btree_key_cache_dirty, btree_key_cache_total),
@@ -209,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
__field(bool, kicked )
__field(u64, min_nr )
__field(u64, min_key_cache )
__field(u64, prereserved )
__field(u64, prereserved_total )
__field(u64, btree_cache_dirty )
__field(u64, btree_cache_total )
__field(u64, btree_key_cache_dirty )
@@ -223,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
__entry->kicked = kicked;
__entry->min_nr = min_nr;
__entry->min_key_cache = min_key_cache;
__entry->prereserved = prereserved;
__entry->prereserved_total = prereserved_total;
__entry->btree_cache_dirty = btree_cache_dirty;
__entry->btree_cache_total = btree_cache_total;
__entry->btree_key_cache_dirty = btree_key_cache_dirty;
__entry->btree_key_cache_total = btree_key_cache_total;
),
TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->direct,
__entry->kicked,
__entry->min_nr,
__entry->min_key_cache,
__entry->prereserved,
__entry->prereserved_total,
__entry->btree_cache_dirty,
__entry->btree_cache_total,
__entry->btree_key_cache_dirty,


@@ -552,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
s.v = v + 1;
s.defined = true;
} else {
/*
* Check if this option was set on the parent - if so, switched
* back to inheriting from the parent:
*
* rename() also has to deal with keeping inherited options up
* to date - see bch2_reinherit_attrs()
*/
spin_lock(&dentry->d_lock);
if (!IS_ROOT(dentry)) {
struct bch_inode_info *dir =
to_bch_ei(d_inode(dentry->d_parent));
@@ -560,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
} else {
s.v = 0;
}
spin_unlock(&dentry->d_lock);
s.defined = false;
}