author		Kent Overstreet <kent.overstreet@gmail.com>	2020-11-14 09:59:58 -0500
committer	Kent Overstreet <kent.overstreet@linux.dev>	2023-10-22 17:08:49 -0400
commit		adbcada43fa79197224b5a522b1faaf222b43bcd (patch)
tree		5df18388e19feb9d2f13f7d661ae9dea4d4f4782 /fs/bcachefs/journal_io.c
parent		b6df4325cd914d988e5b96016f64b879058d0bc6 (diff)
bcachefs: Don't require flush/fua on every journal write
This patch adds a flag to journal entries which, if set, indicates that
they weren't done as flush/fua writes.
- non-flush/fua journal writes don't update last_seq (i.e. they don't
free up space in the journal), so the journal free space
calculations now check whether non-flush journal writes are currently
allowed (i.e. are we low on free space, or would doing a flush write
free up a lot of space in the journal?)
- write_delay_ms, the user-configurable option controlling when open
journal entries are automatically written, is now interpreted as the
maximum delay between flush journal writes (default 1 second).
- bch2_journal_flush_seq_async is changed to ensure that a flush write
covering at least the requested sequence number has completed
- journal read/replay must now ignore, and blacklist, any journal
entries newer than the most recent flush entry in the journal (see the
sketch after this list). Also, the handling of the read_entire_journal
option has been improved: struct journal_replay now has a field,
'ignore', for entries that were read but should not be used.
- assorted refactoring and improvements related to journal read in
journal_io.c and recovery.c
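
As a rough, self-contained illustration of the read-side rule above (a
simplified model, not the kernel code; struct entry, prune() and the
sample sequence numbers below are invented for this sketch): scan
backwards to the newest flush entry, ignore anything newer (those
sequence numbers get blacklisted), and drop anything older than that
entry's last_seq:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for a journal entry as seen at read time. */
struct entry {
	uint64_t seq;       /* sequence number of this entry */
	uint64_t last_seq;  /* oldest seq this entry still pins */
	bool     no_flush;  /* not written as a flush/fua write */
	bool     ignore;    /* set when the entry must not be replayed */
};

/* Keep entries up to the newest flush write; ignore newer non-flush
 * entries (they would be blacklisted) and entries older than the flush
 * entry's last_seq. Returns the seq to start blacklisting at. */
static uint64_t prune(struct entry *e, int n)
{
	uint64_t last_seq = 0, blacklist_seq = 0;
	int i;

	for (i = n - 1; i >= 0; i--) {
		if (!e[i].no_flush) {
			last_seq      = e[i].last_seq;
			blacklist_seq = e[i].seq + 1;
			break;
		}
		e[i].ignore = true;	/* newer than the last flush */
	}

	for (i = 0; i < n; i++)
		if (e[i].seq < last_seq)
			e[i].ignore = true;	/* no longer needed */

	return blacklist_seq;
}

int main(void)
{
	struct entry log[] = {
		{ 10, 8, false, false },
		{ 11, 9, false, false },
		{ 12, 9, true,  false },	/* non-flush tail */
		{ 13, 9, true,  false },
	};
	uint64_t blacklist_seq = prune(log, 4);

	for (int i = 0; i < 4; i++)
		printf("seq %llu: %s\n", (unsigned long long)log[i].seq,
		       log[i].ignore ? "ignored" : "replay");
	printf("blacklist from seq %llu\n",
	       (unsigned long long)blacklist_seq);
	return 0;
}

In the real code this pruning is done on the list of struct
journal_replay in bch2_journal_read(), as in the diff below.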
Previously, we'd have to issue a flush/fua write every time we
accumulated a full journal entry - typically the bucket size. Now we
need to issue them much less frequently: when an fsync is requested, or
it's been more than write_delay_ms since the last flush, or when we need
to free up space in the journal. This is a significant performance
improvement on many write-heavy workloads.
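
To make the write-side policy concrete, here is a minimal sketch of the
skip-flush decision (a simplification under stated assumptions: the
struct and the may_skip_flush() helper below are invented for
illustration, not bcachefs APIs): a journal write may omit flush/fua
only when no fsync needs it, the last flush was recent enough, and the
journal isn't short on space:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified view of the state consulted when deciding whether a
 * journal write needs to be a flush/fua write. */
struct journal_state {
	bool     must_flush;	/* e.g. an fsync is waiting on this write */
	uint64_t now_ms;	/* current time */
	uint64_t last_flush_ms;	/* time of the last flush write */
	uint64_t write_delay_ms;/* max delay between flush writes */
	bool     low_on_space;	/* a flush would free needed journal space */
};

/* Returns true when the write may skip the preflush/fua. Non-flush
 * writes do not advance last_seq, so they free no journal space. */
static bool may_skip_flush(const struct journal_state *j)
{
	if (j->must_flush)
		return false;
	if (j->now_ms - j->last_flush_ms >= j->write_delay_ms)
		return false;
	if (j->low_on_space)
		return false;
	return true;
}

int main(void)
{
	struct journal_state j = {
		.must_flush	= false,
		.now_ms		= 1400,
		.last_flush_ms	= 1000,
		.write_delay_ms	= 1000,	/* default: 1 second */
		.low_on_space	= false,
	};

	printf("skip flush: %s\n", may_skip_flush(&j) ? "yes" : "no"); /* yes */
	j.now_ms = 2100;	/* more than 1s since the last flush */
	printf("skip flush: %s\n", may_skip_flush(&j) ? "yes" : "no"); /* no */
	return 0;
}

The corresponding kernel logic lives in bch2_journal_write() in the
diff below, additionally gated on the BCH_FEATURE_journal_no_flush
superblock feature and the JOURNAL_MAY_SKIP_FLUSH flag.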
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs/journal_io.c')
-rw-r--r--	fs/bcachefs/journal_io.c	208
1 file changed, 167 insertions(+), 41 deletions(-)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 1aeeb58d3c2a..26556bb381b2 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -10,9 +10,26 @@
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "trace.h"
 
+static void __journal_replay_free(struct journal_replay *i)
+{
+	list_del(&i->list);
+	kvpfree(i, offsetof(struct journal_replay, j) +
+		vstruct_bytes(&i->j));
+
+}
+
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+{
+	i->ignore = true;
+
+	if (!c->opts.read_entire_journal)
+		__journal_replay_free(i);
+}
+
 struct journal_list {
 	struct closure cl;
 	struct mutex lock;
@@ -35,28 +52,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 	struct bch_devs_list devs = { .nr = 0 };
 	struct list_head *where;
 	size_t bytes = vstruct_bytes(j);
-	__le64 last_seq;
+	u64 last_seq = 0;
 	int ret;
 
-	last_seq = !list_empty(jlist->head)
-		? list_last_entry(jlist->head, struct journal_replay,
-				  list)->j.last_seq
-		: 0;
-
-	if (!c->opts.read_entire_journal) {
-		/* Is this entry older than the range we need? */
-		if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
-			ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
-			goto out;
+	list_for_each_entry_reverse(i, jlist->head, list) {
+		if (!JSET_NO_FLUSH(&i->j)) {
+			last_seq = le64_to_cpu(i->j.last_seq);
+			break;
 		}
+	}
 
-		/* Drop entries we don't need anymore */
+	/* Is this entry older than the range we need? */
+	if (!c->opts.read_entire_journal &&
+	    le64_to_cpu(j->seq) < last_seq) {
+		ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+		goto out;
+	}
+
+	/* Drop entries we don't need anymore */
+	if (!JSET_NO_FLUSH(j)) {
 		list_for_each_entry_safe(i, pos, jlist->head, list) {
 			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
 				break;
-			list_del(&i->list);
-			kvpfree(i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&i->j));
+			journal_replay_free(c, i);
 		}
 	}
 
@@ -80,9 +98,7 @@ add:
 	if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
 		if (i->bad) {
 			devs = i->devs;
-			list_del(&i->list);
-			kvpfree(i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&i->j));
+			__journal_replay_free(i);
 		} else if (bad) {
 			goto found;
 		} else {
@@ -104,6 +120,7 @@ add:
 	list_add(&i->list, where);
 	i->devs = devs;
 	i->bad = bad;
+	i->ignore = false;
 	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
 found:
 	if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
@@ -698,14 +715,16 @@ err:
 	goto out;
 }
 
-int bch2_journal_read(struct bch_fs *c, struct list_head *list)
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
+		      u64 *blacklist_seq, u64 *start_seq)
 {
 	struct journal_list jlist;
-	struct journal_replay *i;
+	struct journal_replay *i, *t;
 	struct bch_dev *ca;
 	unsigned iter;
 	size_t keys = 0, entries = 0;
 	bool degraded = false;
+	u64 seq, last_seq = 0;
 	int ret = 0;
 
 	closure_init_stack(&jlist.cl);
@@ -734,12 +753,97 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	if (jlist.ret)
 		return jlist.ret;
 
+	if (list_empty(list)) {
+		bch_info(c, "journal read done, but no entries found");
+		return 0;
+	}
+
+	i = list_last_entry(list, struct journal_replay, list);
+	*start_seq = le64_to_cpu(i->j.seq) + 1;
+
+	/*
+	 * Find most recent flush entry, and ignore newer non flush entries -
+	 * those entries will be blacklisted:
+	 */
+	list_for_each_entry_safe_reverse(i, t, list, list) {
+		if (i->ignore)
+			continue;
+
+		if (!JSET_NO_FLUSH(&i->j)) {
+			last_seq	= le64_to_cpu(i->j.last_seq);
+			*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
+			break;
+		}
+
+		journal_replay_free(c, i);
+	}
+
+	if (!last_seq) {
+		fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+		return -1;
+	}
+
+	/* Drop blacklisted entries and entries older than last_seq: */
+	list_for_each_entry_safe(i, t, list, list) {
+		if (i->ignore)
+			continue;
+
+		seq = le64_to_cpu(i->j.seq);
+		if (seq < last_seq) {
+			journal_replay_free(c, i);
+			continue;
+		}
+
+		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
+			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+				    "found blacklisted journal entry %llu", seq);
+
+			journal_replay_free(c, i);
+		}
+	}
+
+	/* Check for missing entries: */
+	seq = last_seq;
+	list_for_each_entry(i, list, list) {
+		if (i->ignore)
+			continue;
+
+		BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+		while (seq < le64_to_cpu(i->j.seq)) {
+			u64 missing_start, missing_end;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			if (seq == le64_to_cpu(i->j.seq))
+				break;
+
+			missing_start = seq;
+
+			while (seq < le64_to_cpu(i->j.seq) &&
+			       !bch2_journal_seq_is_blacklisted(c, seq, false))
+				seq++;
+
+			missing_end = seq - 1;
+			fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 missing_start, missing_end,
+				 last_seq, *blacklist_seq - 1);
+		}
+
+		seq++;
+	}
+
 	list_for_each_entry(i, list, list) {
 		struct jset_entry *entry;
 		struct bkey_i *k, *_n;
 		struct bch_replicas_padded replicas;
 		char buf[80];
 
+		if (i->ignore)
+			continue;
+
 		ret = jset_validate_entries(c, &i->j, READ);
 		if (ret)
 			goto fsck_err;
@@ -767,12 +871,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 		entries++;
 	}
 
-	if (!list_empty(list)) {
-		i = list_last_entry(list, struct journal_replay, list);
+	bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+		 keys, entries, *start_seq);
 
-		bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
-			 keys, entries, le64_to_cpu(i->j.seq));
-	}
+	if (*start_seq != *blacklist_seq)
+		bch_info(c, "dropped unflushed entries %llu-%llu",
+			 *blacklist_seq, *start_seq - 1);
 fsck_err:
 	return ret;
 }
@@ -990,8 +1094,12 @@ static void journal_write_done(struct closure *cl)
 	j->seq_ondisk = seq;
 	if (err && (!j->err_seq || seq < j->err_seq))
 		j->err_seq = seq;
-	j->last_seq_ondisk = last_seq;
-	bch2_journal_space_available(j);
+
+	if (!w->noflush) {
+		j->flushed_seq_ondisk = seq;
+		j->last_seq_ondisk = last_seq;
+		bch2_journal_space_available(j);
+	}
 
 	/*
 	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1067,6 +1175,22 @@ void bch2_journal_write(struct closure *cl)
 	j->write_start_time = local_clock();
 
+	spin_lock(&j->lock);
+	if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
+	    !w->must_flush &&
+	    (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
+	    test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+		w->noflush = true;
+		SET_JSET_NO_FLUSH(jset, true);
+		jset->last_seq = cpu_to_le64(j->last_seq_ondisk);
+
+		j->nr_noflush_writes++;
+	} else {
+		j->last_flush_write = jiffies;
+		j->nr_flush_writes++;
+	}
+	spin_unlock(&j->lock);
+
 	/*
 	 * New btree roots are set by journalling them; when the journal entry
 	 * gets written we have to propagate them to c->btree_roots
@@ -1183,11 +1307,12 @@ retry_alloc:
 			 sectors);
 
 		bio = ca->journal.bio;
-		bio_reset(bio, ca->disk_sb.bdev,
-			  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
+		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector = ptr->offset;
 		bio->bi_end_io = journal_write_endio;
 		bio->bi_private = ca;
+		if (!JSET_NO_FLUSH(jset))
+			bio->bi_opf |= REQ_PREFLUSH|REQ_FUA;
 		bch2_bio_map(bio, jset, sectors << 9);
 
 		trace_journal_write(bio);
@@ -1196,18 +1321,19 @@ retry_alloc:
 		ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
 	}
 
-	for_each_rw_member(ca, c, i)
-		if (journal_flushes_device(ca) &&
-		    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
-			percpu_ref_get(&ca->io_ref);
-
-			bio = ca->journal.bio;
-			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
-			bio->bi_end_io = journal_write_endio;
-			bio->bi_private = ca;
-			closure_bio_submit(bio, cl);
-		}
-
+	if (!JSET_NO_FLUSH(jset)) {
+		for_each_rw_member(ca, c, i)
+			if (journal_flushes_device(ca) &&
+			    !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
+				percpu_ref_get(&ca->io_ref);
+
+				bio = ca->journal.bio;
+				bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+				bio->bi_end_io = journal_write_endio;
+				bio->bi_private = ca;
+				closure_bio_submit(bio, cl);
+			}
+	}
 no_io:
 	bch2_bucket_seq_cleanup(c);