author    Kent Overstreet <kent.overstreet@gmail.com>  2021-04-14 13:26:15 -0400
committer Kent Overstreet <kent.overstreet@linux.dev>  2023-10-22 17:09:00 -0400
commit    2527dd91580b1eb5ff1f8df1b47817ac60395830 (patch)
tree      010df60c971735284954e484109dbac249db7a3a /fs
parent    0ef107859bc868f783cbbbf055a907c702896661 (diff)
bcachefs: Improve bch2_btree_iter_traverse_all()
By changing bch2_btree_iter_traverse_all() to upgrade iterators to intent locks itself, avoiding lock restarts, we can simplify __bch2_btree_node_lock() quite a bit. This also fixes a probable bug where __bch2_btree_node_lock() could drop a lock on an unrelated error path yet still report success, instead of causing a transaction restart.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
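To illustrate the idea, here is a minimal, self-contained sketch of the upgrade pass that __btree_iter_traverse_all() performs after this patch, before retraversing. The type and function names here (iter_sketch, propagate_locks_want) are simplified stand-ins for illustration, not bcachefs identifiers; iterators are assumed to be already sorted into lock order.

/* Simplified stand-in for struct btree_iter: only the fields the
 * upgrade pass looks at. */
struct iter_sketch {
	unsigned btree_id;	/* which btree this iterator walks */
	unsigned locks_want;	/* number of levels wanted intent-locked */
};

/*
 * Walk the (already lock-ordered) iterators from last to first: each
 * iterator inherits the stronger lock requirement of its successor on
 * the same btree, and an iterator ordered before one that wants locks
 * must want at least one lock itself. This mirrors the loop added to
 * __btree_iter_traverse_all() in the diff below.
 */
static void propagate_locks_want(struct iter_sketch *sorted, int nr)
{
	int i;

	for (i = nr - 2; i >= 0; --i) {
		struct iter_sketch *l = &sorted[i], *r = &sorted[i + 1];

		if (l->btree_id == r->btree_id &&
		    l->locks_want < r->locks_want)
			l->locks_want = r->locks_want;	/* upgrade */
		else if (!l->locks_want && r->locks_want)
			l->locks_want = 1;
	}
}

Because these upgrades happen here, with no node locks held, a failed upgrade can no longer leave an unrelated iterator half-unlocked the way the old in-place upgrades inside __bch2_btree_node_lock() could.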
Diffstat (limited to 'fs')
-rw-r--r--  fs/bcachefs/btree_iter.c  | 87
-rw-r--r--  fs/bcachefs/btree_iter.h  |  2
-rw-r--r--  fs/bcachefs/trace.h       | 44
3 files changed, 76 insertions(+), 57 deletions(-)
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 7d8b7d765cf7..c7a0ffc2cad5 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -268,13 +268,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
*/
if (type == SIX_LOCK_intent &&
linked->nodes_locked != linked->nodes_intent_locked) {
- linked->locks_want = max_t(unsigned,
- linked->locks_want,
- __fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 1;
- }
+ deadlock_iter = linked;
+ reason = 1;
}
if (linked->btree_id != iter->btree_id) {
@@ -303,14 +298,8 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
* we're about to lock, it must have the ancestors locked too:
*/
if (level > __fls(linked->nodes_locked)) {
- linked->locks_want =
- max(level + 1, max_t(unsigned,
- linked->locks_want,
- iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false)) {
- deadlock_iter = linked;
- reason = 5;
- }
+ deadlock_iter = linked;
+ reason = 5;
}
/* Must lock btree nodes in key order: */
@@ -320,26 +309,17 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
deadlock_iter = linked;
reason = 7;
}
-
- /*
- * Recheck if this is a node we already have locked - since one
- * of the get_locks() calls might've successfully
- * upgraded/relocked it:
- */
- if (linked->l[level].b == b &&
- btree_node_locked_type(linked, level) >= type) {
- six_lock_increment(&b->c.lock, type);
- return true;
- }
}
if (unlikely(deadlock_iter)) {
trace_trans_restart_would_deadlock(iter->trans->ip, ip,
- reason,
+ trans->in_traverse_all, reason,
deadlock_iter->btree_id,
btree_iter_type(deadlock_iter),
+ &deadlock_iter->real_pos,
iter->btree_id,
- btree_iter_type(iter));
+ btree_iter_type(iter),
+ &pos);
return false;
}
@@ -407,29 +387,11 @@ bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want)
{
- struct btree_iter *linked;
-
EBUG_ON(iter->locks_want >= new_locks_want);
iter->locks_want = new_locks_want;
- if (btree_iter_get_locks(iter, true, true))
- return true;
-
- /*
- * Ancestor nodes must be locked before child nodes, so set locks_want
- * on iterators that might lock ancestors before us to avoid getting
- * -EINTR later:
- */
- trans_for_each_iter(iter->trans, linked)
- if (linked != iter &&
- linked->btree_id == iter->btree_id &&
- linked->locks_want < new_locks_want) {
- linked->locks_want = new_locks_want;
- btree_iter_get_locks(linked, true, false);
- }
-
- return false;
+ return btree_iter_get_locks(iter, true, true);
}
void __bch2_btree_iter_downgrade(struct btree_iter *iter,
@@ -1192,7 +1154,8 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
struct bch_fs *c = trans->c;
struct btree_iter *iter;
u8 sorted[BTREE_ITER_MAX];
- unsigned i, nr_sorted = 0;
+ int i, nr_sorted = 0;
+ bool relock_fail;
if (trans->in_traverse_all)
return -EINTR;
@@ -1200,15 +1163,36 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
trans->in_traverse_all = true;
retry_all:
nr_sorted = 0;
+ relock_fail = false;
- trans_for_each_iter(trans, iter)
+ trans_for_each_iter(trans, iter) {
+ if (!bch2_btree_iter_relock(iter, true))
+ relock_fail = true;
sorted[nr_sorted++] = iter->idx;
+ }
+
+ if (!relock_fail) {
+ trans->in_traverse_all = false;
+ return 0;
+ }
#define btree_iter_cmp_by_idx(_l, _r) \
btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
+
+ for (i = nr_sorted - 2; i >= 0; --i) {
+ struct btree_iter *iter1 = trans->iters + sorted[i];
+ struct btree_iter *iter2 = trans->iters + sorted[i + 1];
+
+ if (iter1->btree_id == iter2->btree_id &&
+ iter1->locks_want < iter2->locks_want)
+ __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
+ else if (!iter1->locks_want && iter2->locks_want)
+ __bch2_btree_iter_upgrade(iter1, 1);
+ }
+
bch2_trans_unlock(trans);
cond_resched();
@@ -1258,6 +1242,8 @@ out:
bch2_btree_cache_cannibalize_unlock(c);
trans->in_traverse_all = false;
+
+ trace_trans_traverse_all(trans->ip);
return ret;
}
@@ -2210,7 +2196,8 @@ void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
if (!(flags & TRANS_RESET_NOUNLOCK))
bch2_trans_cond_resched(trans);
- if (!(flags & TRANS_RESET_NOTRAVERSE))
+ if (!(flags & TRANS_RESET_NOTRAVERSE) &&
+ trans->iters_linked)
bch2_btree_iter_traverse_all(trans);
}
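For reference, a rough sketch of the deadlock check that remains in __bch2_btree_node_lock() after this change (stand-in types and names, not the kernel function): a linked iterator that would deadlock is simply recorded, traced, and the lock attempt fails, rather than attempting lock upgrades in place.

/* Stand-in type, for illustration only: */
struct linked_sketch {
	unsigned nodes_locked;		/* bitmask of levels locked */
	unsigned nodes_intent_locked;	/* bitmask of levels intent-locked */
};

/*
 * Returns false when taking an intent lock would risk deadlock; the
 * caller then returns failure, the transaction restarts with -EINTR,
 * and traverse_all re-runs with locks_want raised up front.
 */
static bool intent_lock_ok(const struct linked_sketch *linked)
{
	/* Another iterator holding read locks while we want an intent
	 * lock is a deadlock risk; previously the code tried to upgrade
	 * the linked iterator here, now it just fails: */
	if (linked->nodes_locked != linked->nodes_intent_locked)
		return false;

	return true;
}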
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 07d9b6d36e51..2f63adb9e420 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -187,7 +187,7 @@ static inline int btree_iter_lock_cmp(const struct btree_iter *l,
{
return cmp_int(l->btree_id, r->btree_id) ?:
-cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
- bkey_cmp(l->pos, r->pos);
+ bkey_cmp(l->real_pos, r->real_pos);
}
/*
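The switch from pos to real_pos matters because the sort in __btree_iter_traverse_all() must now match the order in which nodes are actually locked, and real_pos is the position the iterator really traverses to, which can differ from pos. A worked example (the positions are made up; cmp_int() and bkey_cmp() are the existing helpers):

/* Two iterators on the same btree, both uncached, so ordering is
 * decided purely by real_pos; the ?: chain falls through to the next
 * comparison only on equality:
 *
 *   l->btree_id == r->btree_id            -> cmp_int() == 0, fall through
 *   btree_iter_is_cached(l) == (r)        -> cmp_int() == 0, fall through
 *   bkey_cmp(POS(1, 100), POS(1, 200))    -> < 0
 *
 * so btree_iter_lock_cmp(l, r) < 0 and l locks its nodes first.
 */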
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 493f9223c5bd..02f2662e7bde 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -561,43 +561,70 @@ DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
TRACE_EVENT(trans_restart_would_deadlock,
TP_PROTO(unsigned long trans_ip,
unsigned long caller_ip,
+ bool in_traverse_all,
unsigned reason,
enum btree_id have_btree_id,
unsigned have_iter_type,
+ struct bpos *have_pos,
enum btree_id want_btree_id,
- unsigned want_iter_type),
- TP_ARGS(trans_ip, caller_ip, reason,
- have_btree_id, have_iter_type,
- want_btree_id, want_iter_type),
+ unsigned want_iter_type,
+ struct bpos *want_pos),
+ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+ have_btree_id, have_iter_type, have_pos,
+ want_btree_id, want_iter_type, want_pos),
TP_STRUCT__entry(
__field(unsigned long, trans_ip )
__field(unsigned long, caller_ip )
+ __field(u8, in_traverse_all )
__field(u8, reason )
__field(u8, have_btree_id )
__field(u8, have_iter_type )
__field(u8, want_btree_id )
__field(u8, want_iter_type )
+
+ __field(u64, have_pos_inode )
+ __field(u64, have_pos_offset )
+ __field(u32, have_pos_snapshot)
+ __field(u32, want_pos_snapshot)
+ __field(u64, want_pos_inode )
+ __field(u64, want_pos_offset )
),
TP_fast_assign(
__entry->trans_ip = trans_ip;
__entry->caller_ip = caller_ip;
+ __entry->in_traverse_all = in_traverse_all;
__entry->reason = reason;
__entry->have_btree_id = have_btree_id;
__entry->have_iter_type = have_iter_type;
__entry->want_btree_id = want_btree_id;
__entry->want_iter_type = want_iter_type;
+
+ __entry->have_pos_inode = have_pos->inode;
+ __entry->have_pos_offset = have_pos->offset;
+ __entry->have_pos_snapshot = have_pos->snapshot;
+
+ __entry->want_pos_inode = want_pos->inode;
+ __entry->want_pos_offset = want_pos->offset;
+ __entry->want_pos_snapshot = want_pos->snapshot;
),
- TP_printk("%pS %pS because %u have %u:%u want %u:%u",
+ TP_printk("%pS %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
(void *) __entry->trans_ip,
(void *) __entry->caller_ip,
+ __entry->in_traverse_all,
__entry->reason,
__entry->have_btree_id,
__entry->have_iter_type,
+ __entry->have_pos_inode,
+ __entry->have_pos_offset,
+ __entry->have_pos_snapshot,
__entry->want_btree_id,
- __entry->want_iter_type)
+ __entry->want_iter_type,
+ __entry->want_pos_inode,
+ __entry->want_pos_offset,
+ __entry->want_pos_snapshot)
);
TRACE_EVENT(trans_restart_iters_realloced,
@@ -689,6 +716,11 @@ DEFINE_EVENT(transaction_restart, trans_restart_traverse,
TP_ARGS(ip)
);
+DEFINE_EVENT(transaction_restart, trans_traverse_all,
+ TP_PROTO(unsigned long ip),
+ TP_ARGS(ip)
+);
+
DECLARE_EVENT_CLASS(node_lock_fail,
TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
TP_ARGS(level, iter_seq, node, node_seq),
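With the new arguments wired through, a rendered trans_restart_would_deadlock event now also reports whether the restart happened inside traverse_all and the positions involved. Hypothetically (the symbol names, offsets, and all values below are made up for illustration), a line in the trace buffer would look like:

	bch2_trans_commit+0x1a4/0xb30 bch2_btree_iter_traverse+0x3c/0x90 traverse_all 1 because 5 have 2:0 4096:128:0 want 2:0 4096:256:0

following the TP_printk() format above: the two %pS call sites, the in_traverse_all flag, the reason code, then btree_id:iter_type and inode:offset:snapshot for both the held and the wanted position.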