aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-16 13:33:06 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-16 13:33:06 +0200
commit26bb0d3f38a764b743a3ad5c8b6e5b5044d7ceb4 (patch)
treea08d01893b603d2f611a617f6055b54a835c03f0
parent3a4d319a8fb5a9bbdf5b31ef32841eb286b1dcc2 (diff)
parentd4d7c03f7ee1d7f16b7b6e885b1e00968f72b93c (diff)
Merge tag 'for-6.12/block-20240913' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - MD changes via Song: - md-bitmap refactoring (Yu Kuai) - raid5 performance optimization (Artur Paszkiewicz) - Other small fixes (Yu Kuai, Chen Ni) - Add a sysfs entry 'new_level' (Xiao Ni) - Improve information reported in /proc/mdstat (Mateusz Kusiak) - NVMe changes via Keith: - Asynchronous namespace scanning (Stuart) - TCP TLS updates (Hannes) - RDMA queue controller validation (Niklas) - Align field names to the spec (Anuj) - Metadata support validation (Puranjay) - A syntax cleanup (Shen) - Fix a Kconfig linking error (Arnd) - New queue-depth quirk (Keith) - Add missing unplug trace event (Keith) - blk-iocost fixes (Colin, Konstantin) - t10-pi modular removal and fixes (Alexey) - Fix for potential BLKSECDISCARD overflow (Alexey) - bio splitting cleanups and fixes (Christoph) - Deal with folios rather than rather than pages, speeding up how the block layer handles bigger IOs (Kundan) - Use spinlocks rather than bit spinlocks in zram (Sebastian, Mike) - Reduce zoned device overhead in ublk (Ming) - Add and use sendpages_ok() for drbd and nvme-tcp (Ofir) - Fix regression in partition error pointer checking (Riyan) - Add support for write zeroes and rotational status in nbd (Wouter) - Add Yu Kuai as new BFQ maintainer. The scheduler has been unmaintained for quite a while. - Various sets of fixes for BFQ (Yu Kuai) - Misc fixes and cleanups (Alvaro, Christophe, Li, Md Haris, Mikhail, Yang) * tag 'for-6.12/block-20240913' of git://git.kernel.dk/linux: (120 commits) nvme-pci: qdepth 1 quirk block: fix potential invalid pointer dereference in blk_add_partition blk_iocost: make read-only static array vrate_adj_pct const block: unpin user pages belonging to a folio at once mm: release number of pages of a folio block: introduce folio awareness and add a bigger size from folio block: Added folio-ized version of bio_add_hw_page() block, bfq: factor out a helper to split bfqq in bfq_init_rq() block, bfq: remove local variable 'bfqq_already_existing' in bfq_init_rq() block, bfq: remove local variable 'split' in bfq_init_rq() block, bfq: remove bfq_log_bfqg() block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator() block, bfq: fix procress reference leakage for bfqq in merge chain block, bfq: fix uaf for accessing waker_bfqq after splitting blk-throttle: support prioritized processing of metadata blk-throttle: remove last_low_overflow_time drbd: Add NULL check for net_conf to prevent dereference in state validation nvme-tcp: fix link failure for TCP auth blk-mq: add missing unplug trace event mtip32xx: Remove redundant null pointer checks in mtip_hw_debugfs_init() ...
-rw-r--r--MAINTAINERS5
-rw-r--r--block/bfq-cgroup.c8
-rw-r--r--block/bfq-iosched.c206
-rw-r--r--block/bfq-iosched.h8
-rw-r--r--block/bio.c112
-rw-r--r--block/blk-cgroup.c23
-rw-r--r--block/blk-cgroup.h1
-rw-r--r--block/blk-iocost.c10
-rw-r--r--block/blk-ioprio.c57
-rw-r--r--block/blk-ioprio.h9
-rw-r--r--block/blk-merge.c162
-rw-r--r--block/blk-mq.c14
-rw-r--r--block/blk-rq-qos.c2
-rw-r--r--block/blk-throttle.c69
-rw-r--r--block/blk-throttle.h2
-rw-r--r--block/blk.h74
-rw-r--r--block/ioctl.c9
-rw-r--r--block/partitions/core.c8
-rw-r--r--block/t10-pi.c8
-rw-r--r--drivers/block/drbd/drbd_int.h11
-rw-r--r--drivers/block/drbd/drbd_main.c2
-rw-r--r--drivers/block/drbd/drbd_state.c2
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c19
-rw-r--r--drivers/block/nbd.c28
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rnbd/rnbd-srv.c11
-rw-r--r--drivers/block/ublk_drv.c62
-rw-r--r--drivers/block/zram/zram_drv.c16
-rw-r--r--drivers/block/zram/zram_drv.h7
-rw-r--r--drivers/md/dm-raid.c7
-rw-r--r--drivers/md/md-bitmap.c568
-rw-r--r--drivers/md/md-bitmap.h268
-rw-r--r--drivers/md/md-cluster.c91
-rw-r--r--drivers/md/md.c332
-rw-r--r--drivers/md/md.h13
-rw-r--r--drivers/md/raid1-10.c9
-rw-r--r--drivers/md/raid1.c99
-rw-r--r--drivers/md/raid10.c75
-rw-r--r--drivers/md/raid5-cache.c14
-rw-r--r--drivers/md/raid5.c157
-rw-r--r--drivers/md/raid5.h2
-rw-r--r--drivers/nvme/common/keyring.c58
-rw-r--r--drivers/nvme/host/Kconfig3
-rw-r--r--drivers/nvme/host/core.c47
-rw-r--r--drivers/nvme/host/fabrics.c2
-rw-r--r--drivers/nvme/host/ioctl.c26
-rw-r--r--drivers/nvme/host/nvme.h7
-rw-r--r--drivers/nvme/host/pci.c18
-rw-r--r--drivers/nvme/host/rdma.c6
-rw-r--r--drivers/nvme/host/sysfs.c90
-rw-r--r--drivers/nvme/host/tcp.c57
-rw-r--r--drivers/nvme/target/admin-cmd.c2
-rw-r--r--drivers/nvme/target/auth.c12
-rw-r--r--drivers/nvme/target/rdma.c4
-rw-r--r--fs/btrfs/bio.c30
-rw-r--r--include/linux/bio.h4
-rw-r--r--include/linux/blkdev.h3
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/net.h19
-rw-r--r--include/linux/nvme-keyring.h6
-rw-r--r--include/linux/nvme-rdma.h6
-rw-r--r--include/linux/nvme.h8
-rw-r--r--include/uapi/linux/nbd.h8
-rw-r--r--mm/gup.c13
64 files changed, 1711 insertions, 1301 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index f0310d42374d..7081f93bdb9a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3813,10 +3813,9 @@ F: Documentation/filesystems/befs.rst
F: fs/befs/
BFQ I/O SCHEDULER
-M: Paolo Valente <paolo.valente@unimore.it>
-M: Jens Axboe <axboe@kernel.dk>
+M: Yu Kuai <yukuai3@huawei.com>
L: linux-block@vger.kernel.org
-S: Maintained
+S: Odd Fixes
F: Documentation/block/bfq-iosched.rst
F: block/bfq-*
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b758693697c0..e831aedb4643 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -679,12 +679,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
bfqg_and_blkg_put(old_parent);
- if (entity->parent &&
- entity->parent->last_bfqq_created == bfqq)
- entity->parent->last_bfqq_created = NULL;
- else if (bfqd->last_bfqq_created == bfqq)
- bfqd->last_bfqq_created = NULL;
-
+ bfq_reassign_last_bfqq(bfqq, NULL);
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
/* pin down bfqg and its associated blkg */
@@ -741,7 +736,6 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd,
*/
bfq_put_cooperator(sync_bfqq);
bic_set_bfqq(bic, NULL, true, act_idx);
- bfq_release_process_ref(bfqd, sync_bfqq);
}
}
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 36a4998c4b37..0747d9d0e48c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -2911,8 +2911,12 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/* if a merge has already been setup, then proceed with that first */
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
+ new_bfqq = bfqq->new_bfqq;
+ if (new_bfqq) {
+ while (new_bfqq->new_bfqq)
+ new_bfqq = new_bfqq->new_bfqq;
+ return new_bfqq;
+ }
/*
* Check delayed stable merge for rotational or non-queueing
@@ -3093,8 +3097,8 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
-static void
-bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
+ struct bfq_queue *new_bfqq)
{
if (cur_bfqq->entity.parent &&
cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
@@ -3125,10 +3129,12 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}
-static void
-bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
- struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq)
{
+ struct bfq_queue *new_bfqq = bfqq->new_bfqq;
+
bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
(unsigned long)new_bfqq->pid);
/* Save weight raising and idle window of the merged queues */
@@ -3222,6 +3228,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_reassign_last_bfqq(bfqq, new_bfqq);
bfq_release_process_ref(bfqd, bfqq);
+
+ return new_bfqq;
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -3257,14 +3265,8 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* fulfilled, i.e., bic can be redirected to new_bfqq
* and bfqq can be put.
*/
- bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
- new_bfqq);
- /*
- * If we get here, bio will be queued into new_queue,
- * so use new_bfqq to decide whether bio and rq can be
- * merged.
- */
- bfqq = new_bfqq;
+ while (bfqq != new_bfqq)
+ bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq);
/*
* Change also bqfd->bio_bfqq, as
@@ -5432,6 +5434,8 @@ void bfq_put_cooperator(struct bfq_queue *bfqq)
bfq_put_queue(__bfqq);
__bfqq = next;
}
+
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
}
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
@@ -5444,8 +5448,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
bfq_put_cooperator(bfqq);
-
- bfq_release_process_ref(bfqd, bfqq);
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
@@ -5701,9 +5703,7 @@ bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* state before killing it.
*/
bfqq->bic = bic;
- bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
-
- return new_bfqq;
+ return bfq_merge_bfqqs(bfqd, bic, bfqq);
}
/*
@@ -6158,6 +6158,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
+ struct bfq_queue *old_bfqq = bfqq;
/*
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
@@ -6174,18 +6175,18 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* new_bfqq.
*/
if (bic_to_bfqq(RQ_BIC(rq), true,
- bfq_actuator_index(bfqd, rq->bio)) == bfqq)
- bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
- bfqq, new_bfqq);
+ bfq_actuator_index(bfqd, rq->bio)) == bfqq) {
+ while (bfqq != new_bfqq)
+ bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq);
+ }
- bfq_clear_bfqq_just_created(bfqq);
+ bfq_clear_bfqq_just_created(old_bfqq);
/*
* rq is about to be enqueued into new_bfqq,
* release rq reference on bfqq
*/
- bfq_put_queue(bfqq);
+ bfq_put_queue(old_bfqq);
rq->elv.priv[1] = new_bfqq;
- bfqq = new_bfqq;
}
bfq_update_io_thinktime(bfqd, bfqq);
@@ -6723,7 +6724,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
{
bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
- if (bfqq_process_refs(bfqq) == 1) {
+ if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) {
bfqq->pid = current->pid;
bfq_clear_bfqq_coop(bfqq);
bfq_clear_bfqq_split_coop(bfqq);
@@ -6733,16 +6734,13 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx);
bfq_put_cooperator(bfqq);
-
- bfq_release_process_ref(bfqq->bfqd, bfqq);
return NULL;
}
-static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
- struct bfq_io_cq *bic,
- struct bio *bio,
- bool split, bool is_sync,
- bool *new_queue)
+static struct bfq_queue *
+__bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+ struct bio *bio, bool split, bool is_sync,
+ bool *new_queue)
{
unsigned int act_idx = bfq_actuator_index(bfqd, bio);
struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
@@ -6821,6 +6819,84 @@ static void bfq_prepare_request(struct request *rq)
rq->elv.priv[0] = rq->elv.priv[1] = NULL;
}
+static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq)
+{
+ struct bfq_queue *new_bfqq = bfqq->new_bfqq;
+ struct bfq_queue *waker_bfqq = bfqq->waker_bfqq;
+
+ if (!waker_bfqq)
+ return NULL;
+
+ while (new_bfqq) {
+ if (new_bfqq == waker_bfqq) {
+ /*
+ * If waker_bfqq is in the merge chain, and current
+ * is the only procress.
+ */
+ if (bfqq_process_refs(waker_bfqq) == 1)
+ return NULL;
+ break;
+ }
+
+ new_bfqq = new_bfqq->new_bfqq;
+ }
+
+ return waker_bfqq;
+}
+
+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bio *bio,
+ unsigned int idx,
+ bool is_sync)
+{
+ struct bfq_queue *waker_bfqq;
+ struct bfq_queue *bfqq;
+ bool new_queue = false;
+
+ bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
+ &new_queue);
+ if (unlikely(new_queue))
+ return bfqq;
+
+ /* If the queue was seeky for too long, break it apart. */
+ if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) ||
+ bic->bfqq_data[idx].stably_merged)
+ return bfqq;
+
+ waker_bfqq = bfq_waker_bfqq(bfqq);
+
+ /* Update bic before losing reference to bfqq */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bic->bfqq_data[idx].saved_in_large_burst = true;
+
+ bfqq = bfq_split_bfqq(bic, bfqq);
+ if (bfqq) {
+ bfq_bfqq_resume_state(bfqq, bfqd, bic, true);
+ return bfqq;
+ }
+
+ bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL);
+ if (unlikely(bfqq == &bfqd->oom_bfqq))
+ return bfqq;
+
+ bfq_bfqq_resume_state(bfqq, bfqd, bic, false);
+ bfqq->waker_bfqq = waker_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then new_bfqq->waker_bfqq must be
+ * reset. So insert new_bfqq into the
+ * woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ if (waker_bfqq)
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+
+ return bfqq;
+}
+
/*
* If needed, init rq, allocate bfq data structures associated with
* rq, and increment reference counters in the destination bfq_queue
@@ -6852,8 +6928,6 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
struct bfq_io_cq *bic;
const int is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq;
- bool new_queue = false;
- bool bfqq_already_existing = false, split = false;
unsigned int a_idx = bfq_actuator_index(bfqd, bio);
if (unlikely(!rq->elv.icq))
@@ -6870,54 +6944,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
return RQ_BFQQ(rq);
bic = icq_to_bic(rq->elv.icq);
-
bfq_check_ioprio_change(bic, bio);
-
bfq_bic_update_cgroup(bic, bio);
-
- bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
- &new_queue);
-
- if (likely(!new_queue)) {
- /* If the queue was seeky for too long, break it apart. */
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
- !bic->bfqq_data[a_idx].stably_merged) {
- struct bfq_queue *old_bfqq = bfqq;
-
- /* Update bic before losing reference to bfqq */
- if (bfq_bfqq_in_large_burst(bfqq))
- bic->bfqq_data[a_idx].saved_in_large_burst =
- true;
-
- bfqq = bfq_split_bfqq(bic, bfqq);
- split = true;
-
- if (!bfqq) {
- bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
- true, is_sync,
- NULL);
- if (unlikely(bfqq == &bfqd->oom_bfqq))
- bfqq_already_existing = true;
- } else
- bfqq_already_existing = true;
-
- if (!bfqq_already_existing) {
- bfqq->waker_bfqq = old_bfqq->waker_bfqq;
- bfqq->tentative_waker_bfqq = NULL;
-
- /*
- * If the waker queue disappears, then
- * new_bfqq->waker_bfqq must be
- * reset. So insert new_bfqq into the
- * woken_list of the waker. See
- * bfq_check_waker for details.
- */
- if (bfqq->waker_bfqq)
- hlist_add_head(&bfqq->woken_list_node,
- &bfqq->waker_bfqq->woken_list);
- }
- }
- }
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync);
bfqq_request_allocated(bfqq);
bfqq->ref++;
@@ -6934,18 +6963,9 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
* addition, if the queue has also just been split, we have to
* resume its state.
*/
- if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq &&
+ bfqq_process_refs(bfqq) == 1)
bfqq->bic = bic;
- if (split) {
- /*
- * The queue has just been split from a shared
- * queue: restore the idle window and the
- * possible weight raising period.
- */
- bfq_bfqq_resume_state(bfqq, bfqd, bic,
- bfqq_already_existing);
- }
- }
/*
* Consider bfqq as possibly belonging to a burst of newly
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 08ddf2cfae5b..687a3a7ba784 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1156,6 +1156,8 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
+void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq,
+ struct bfq_queue *new_bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
@@ -1183,11 +1185,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
"%s " fmt, pid_str, ##args); \
} while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
- blk_add_cgroup_trace_msg((bfqd)->queue, \
- &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \
-} while (0)
-
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
@@ -1197,7 +1194,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
-#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bio.c b/block/bio.c
index c4053d49679a..ac4d77c88932 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -931,7 +931,8 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
- *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
+ *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
+ PAGE_MASK));
if (!*same_page) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
@@ -1017,6 +1018,29 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
}
/**
+ * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints
+ * @q: the target queue
+ * @bio: destination bio
+ * @folio: folio to add
+ * @len: vec entry length
+ * @offset: vec entry offset in the folio
+ * @max_sectors: maximum number of sectors that can be added
+ * @same_page: return if the segment has been merged inside the same folio
+ *
+ * Add a folio to a bio while respecting the hardware max_sectors, max_segment
+ * and gap limitations.
+ */
+int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
+ struct folio *folio, size_t len, size_t offset,
+ unsigned int max_sectors, bool *same_page)
+{
+ if (len > UINT_MAX || offset > UINT_MAX)
+ return 0;
+ return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset,
+ max_sectors, same_page);
+}
+
+/**
* bio_add_pc_page - attempt to add page to passthrough bio
* @q: the target queue
* @bio: destination bio
@@ -1166,7 +1190,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
struct folio_iter fi;
bio_for_each_folio_all(fi, bio) {
- struct page *page;
size_t nr_pages;
if (mark_dirty) {
@@ -1174,12 +1197,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_mark_dirty(fi.folio);
folio_unlock(fi.folio);
}
- page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
- do {
- bio_release_page(bio, page++);
- } while (--nr_pages != 0);
+ unpin_user_folio(fi.folio, nr_pages);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1204,8 +1224,8 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_add_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
+static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
+ size_t offset)
{
bool same_page = false;
@@ -1214,30 +1234,61 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
+ folio_page(folio, 0), len, offset,
+ &same_page)) {
bio->bi_iter.bi_size += len;
- if (same_page)
- bio_release_page(bio, page);
+ if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
+ unpin_user_folio(folio, 1);
return 0;
}
- __bio_add_page(bio, page, len, offset);
+ bio_add_folio_nofail(bio, folio, len, offset);
return 0;
}
-static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
+static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio,
+ size_t len, size_t offset)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false;
- if (bio_add_hw_page(q, bio, page, len, offset,
+ if (bio_add_hw_folio(q, bio, folio, len, offset,
queue_max_zone_append_sectors(q), &same_page) != len)
return -EINVAL;
- if (same_page)
- bio_release_page(bio, page);
+ if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
+ unpin_user_folio(folio, 1);
return 0;
}
+static unsigned int get_contig_folio_len(unsigned int *num_pages,
+ struct page **pages, unsigned int i,
+ struct folio *folio, size_t left,
+ size_t offset)
+{
+ size_t bytes = left;
+ size_t contig_sz = min_t(size_t, PAGE_SIZE - offset, bytes);
+ unsigned int j;
+
+ /*
+ * We might COW a single page in the middle of
+ * a large folio, so we have to check that all
+ * pages belong to the same folio.
+ */
+ bytes -= contig_sz;
+ for (j = i + 1; j < i + *num_pages; j++) {
+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+ if (page_folio(pages[j]) != folio ||
+ pages[j] != pages[j - 1] + 1) {
+ break;
+ }
+ contig_sz += next;
+ bytes -= next;
+ }
+ *num_pages = j - i;
+
+ return contig_sz;
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
@@ -1257,9 +1308,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i = 0;
- size_t offset;
+ ssize_t size;
+ unsigned int num_pages, i = 0;
+ size_t offset, folio_offset, left, len;
int ret = 0;
/*
@@ -1299,17 +1350,28 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
goto out;
}
- for (left = size, i = 0; left > 0; left -= len, i++) {
+ for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
+ struct folio *folio = page_folio(page);
+
+ folio_offset = ((size_t)folio_page_idx(folio, page) <<
+ PAGE_SHIFT) + offset;
+
+ len = min(folio_size(folio) - folio_offset, left);
+
+ num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+ if (num_pages > 1)
+ len = get_contig_folio_len(&num_pages, pages, i,
+ folio, left, offset);
- len = min_t(size_t, PAGE_SIZE - offset, left);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- ret = bio_iov_add_zone_append_page(bio, page, len,
- offset);
+ ret = bio_iov_add_zone_append_folio(bio, folio, len,
+ folio_offset);
if (ret)
break;
} else
- bio_iov_add_page(bio, page, len, offset);
+ bio_iov_add_folio(bio, folio, len, folio_offset);
offset = 0;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 69e70964398c..e68c725cf8d9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1458,7 +1458,6 @@ int blkcg_init_disk(struct gendisk *disk)
struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
- int ret;
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
@@ -1478,15 +1477,8 @@ int blkcg_init_disk(struct gendisk *disk)
if (preloaded)
radix_tree_preload_end();
- ret = blk_ioprio_init(disk);
- if (ret)
- goto err_destroy_all;
-
return 0;
-err_destroy_all:
- blkg_destroy_all(disk);
- return ret;
err_unlock:
spin_unlock_irq(&q->queue_lock);
if (preloaded)
@@ -1554,6 +1546,14 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
if (blkcg_policy_enabled(q, pol))
return 0;
+ /*
+ * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
+ * for example, ioprio. Such policy will work on blkcg level, not disk
+ * level, and don't need to be activated.
+ */
+ if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
+ return -EINVAL;
+
if (queue_is_mq(q))
blk_mq_freeze_queue(q);
retry:
@@ -1733,9 +1733,12 @@ int blkcg_policy_register(struct blkcg_policy *pol)
goto err_unlock;
}
- /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */
+ /*
+ * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy
+ * without pd_alloc_fn/pd_free_fn can't be activated.
+ */
if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) ||
- (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
+ (!pol->pd_alloc_fn ^ !pol->pd_free_fn))
goto err_unlock;
/* register @pol */
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 864fad4a850b..b9e3265c1eb3 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -485,7 +485,6 @@ static inline void blkcg_deactivate_policy(struct gendisk *disk,
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 690ca99dfaca..9dc9323f84ac 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -648,7 +648,7 @@ static const struct ioc_params autop[] = {
* vrate adjust percentages indexed by ioc->busy_level. We adjust up on
* vtime credit shortage and down on device saturation.
*/
-static u32 vrate_adj_pct[] =
+static const u32 vrate_adj_pct[] =
{ 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -2076,7 +2076,7 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
struct ioc_now *now)
{
struct ioc_gq *iocg;
- u64 dur, usage_pct, nr_cycles;
+ u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
/* if no debtor, reset the cycle */
if (!nr_debtors) {
@@ -2138,10 +2138,12 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
old_debt = iocg->abs_vdebt;
old_delay = iocg->delay;
+ nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
if (iocg->abs_vdebt)
- iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
+ iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
+
if (iocg->delay)
- iocg->delay = iocg->delay >> nr_cycles ?: 1;
+ iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
iocg_kick_waitq(iocg, true, now);
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 4051fada01f1..8fff7ccc0ac7 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -50,14 +50,6 @@ static const char *policy_name[] = {
static struct blkcg_policy ioprio_policy;
/**
- * struct ioprio_blkg - Per (cgroup, request queue) data.
- * @pd: blkg_policy_data structure.
- */
-struct ioprio_blkg {
- struct blkg_policy_data pd;
-};
-
-/**
* struct ioprio_blkcg - Per cgroup data.
* @cpd: blkcg_policy_data structure.
* @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
@@ -67,11 +59,6 @@ struct ioprio_blkcg {
enum prio_policy prio_policy;
};
-static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
-}
-
static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
{
return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
@@ -84,16 +71,6 @@ ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
}
-static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
-{
- struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
-
- if (!pd)
- return NULL;
-
- return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
-}
-
static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
@@ -118,25 +95,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
return nbytes;
}
-static struct blkg_policy_data *
-ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
-{
- struct ioprio_blkg *ioprio_blkg;
-
- ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
- if (!ioprio_blkg)
- return NULL;
-
- return &ioprio_blkg->pd;
-}
-
-static void ioprio_free_pd(struct blkg_policy_data *pd)
-{
- struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
-
- kfree(ioprio_blkg);
-}
-
static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
{
struct ioprio_blkcg *blkcg;
@@ -179,14 +137,11 @@ static struct blkcg_policy ioprio_policy = {
.cpd_alloc_fn = ioprio_alloc_cpd,
.cpd_free_fn = ioprio_free_cpd,
-
- .pd_alloc_fn = ioprio_alloc_pd,
- .pd_free_fn = ioprio_free_pd,
};
void blkcg_set_ioprio(struct bio *bio)
{
- struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+ struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg);
u16 prio;
if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
@@ -219,16 +174,6 @@ void blkcg_set_ioprio(struct bio *bio)
bio->bi_ioprio = prio;
}
-void blk_ioprio_exit(struct gendisk *disk)
-{
- blkcg_deactivate_policy(disk, &ioprio_policy);
-}
-
-int blk_ioprio_init(struct gendisk *disk)
-{
- return blkcg_activate_policy(disk, &ioprio_policy);
-}
-
static int __init ioprio_init(void)
{
return blkcg_policy_register(&ioprio_policy);
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index b6afb8e80de0..9265143f9bc9 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -9,17 +9,8 @@ struct request_queue;
struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
-int blk_ioprio_init(struct gendisk *disk);
-void blk_ioprio_exit(struct gendisk *disk);
void blkcg_set_ioprio(struct bio *bio);
#else
-static inline int blk_ioprio_init(struct gendisk *disk)
-{
- return 0;
-}
-static inline void blk_ioprio_exit(struct gendisk *disk)
-{
-}
static inline void blkcg_set_ioprio(struct bio *bio)
{
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index de5281bcadc5..56769c4bcd79 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,9 +105,33 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
-static struct bio *bio_split_discard(struct bio *bio,
- const struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
+{
+ if (unlikely(split_sectors < 0)) {
+ bio->bi_status = errno_to_blk_status(split_sectors);
+ bio_endio(bio);
+ return NULL;
+ }
+
+ if (split_sectors) {
+ struct bio *split;
+
+ split = bio_split(bio, split_sectors, GFP_NOIO,
+ &bio->bi_bdev->bd_disk->bio_split);
+ split->bi_opf |= REQ_NOMERGE;
+ blkcg_bio_issue_init(split);
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ WARN_ON_ONCE(bio_zone_write_plugging(bio));
+ submit_bio_noacct(bio);
+ return split;
+ }
+
+ return bio;
+}
+
+struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nsegs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -121,10 +145,10 @@ static struct bio *bio_split_discard(struct bio *bio,
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
if (unlikely(!max_discard_sectors))
- return NULL;
+ return bio;
if (bio_sectors(bio) <= max_discard_sectors)
- return NULL;
+ return bio;
split_sectors = max_discard_sectors;
@@ -139,19 +163,18 @@ static struct bio *bio_split_discard(struct bio *bio,
if (split_sectors > tmp)
split_sectors -= tmp;
- return bio_split(bio, split_sectors, GFP_NOIO, bs);
+ return bio_submit_split(bio, split_sectors);
}
-static struct bio *bio_split_write_zeroes(struct bio *bio,
- const struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nsegs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
- return NULL;
+ return bio;
if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
- return NULL;
- return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
+ return bio;
+ return bio_submit_split(bio, lim->max_write_zeroes_sectors);
}
static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim,
@@ -274,27 +297,19 @@ static bool bvec_split_segs(const struct queue_limits *lim,
}
/**
- * bio_split_rw - split a bio in two bios
+ * bio_split_rw_at - check if and where to split a read/write bio
* @bio: [in] bio to be split
* @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
- * @bs: [in] bio set to allocate the clone from
* @max_bytes: [in] maximum number of bytes per bio
*
- * Clone @bio, update the bi_iter of the clone to represent the first sectors
- * of @bio and update @bio->bi_iter to represent the remaining sectors. The
- * following is guaranteed for the cloned bio:
- * - That it has at most @max_bytes worth of data
- * - That it has at most queue_max_segments(@q) segments.
- *
- * Except for discard requests the cloned bio will point at the bi_io_vec of
- * the original bio. It is the responsibility of the caller to ensure that the
- * original bio is not freed before the cloned bio. The caller is also
- * responsible for ensuring that @bs is only destroyed after processing of the
- * split bio has finished.
+ * Find out if @bio needs to be split to fit the queue limits in @lim and a
+ * maximum size of @max_bytes. Returns a negative error number if @bio can't be
+ * split, 0 if the bio doesn't have to be split, or a positive sector offset if
+ * @bio needs to be split.
*/
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, struct bio_set *bs, unsigned max_bytes)
+int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
@@ -324,22 +339,17 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
}
*segs = nsegs;
- return NULL;
+ return 0;
split:
- if (bio->bi_opf & REQ_ATOMIC) {
- bio->bi_status = BLK_STS_INVAL;
- bio_endio(bio);
- return ERR_PTR(-EINVAL);
- }
+ if (bio->bi_opf & REQ_ATOMIC)
+ return -EINVAL;
+
/*
* We can't sanely support splitting for a REQ_NOWAIT bio. End it
* with EAGAIN if splitting is required and return an error pointer.
*/
- if (bio->bi_opf & REQ_NOWAIT) {
- bio->bi_status = BLK_STS_AGAIN;
- bio_endio(bio);
- return ERR_PTR(-EAGAIN);
- }
+ if (bio->bi_opf & REQ_NOWAIT)
+ return -EAGAIN;
*segs = nsegs;
@@ -356,58 +366,36 @@ split:
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
- return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
+ return bytes >> SECTOR_SHIFT;
}
-EXPORT_SYMBOL_GPL(bio_split_rw);
+EXPORT_SYMBOL_GPL(bio_split_rw_at);
-/**
- * __bio_split_to_limits - split a bio to fit the queue limits
- * @bio: bio to be split
- * @lim: queue limits to split based on
- * @nr_segs: returns the number of segments in the returned bio
- *
- * Check if @bio needs splitting based on the queue limits, and if so split off
- * a bio fitting the limits from the beginning of @bio and return it. @bio is
- * shortened to the remainder and re-submitted.
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nr_segs)
+{
+ return bio_submit_split(bio,
+ bio_split_rw_at(bio, lim, nr_segs,
+ get_max_io_size(bio, lim) << SECTOR_SHIFT));
+}
+
+/*
+ * REQ_OP_ZONE_APPEND bios must never be split by the block layer.
*
- * The split bio is allocated from @q->bio_split, which is provided by the
- * block layer.
+ * But we want the nr_segs calculation provided by bio_split_rw_at, and having
+ * a good sanity check that the submitter built the bio correctly is nice to
+ * have as well.
*/
-struct bio *__bio_split_to_limits(struct bio *bio,
- const struct queue_limits *lim,
- unsigned int *nr_segs)
+struct bio *bio_split_zone_append(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nr_segs)
{
- struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
- struct bio *split;
-
- switch (bio_op(bio)) {
- case REQ_OP_DISCARD:
- case REQ_OP_SECURE_ERASE:
- split = bio_split_discard(bio, lim, nr_segs, bs);
- break;
- case REQ_OP_WRITE_ZEROES:
- split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
- break;
- default:
- split = bio_split_rw(bio, lim, nr_segs, bs,
- get_max_io_size(bio, lim) << SECTOR_SHIFT);
- if (IS_ERR(split))
- return NULL;
- break;
- }
-
- if (split) {
- /* there isn't chance to merge the split bio */
- split->bi_opf |= REQ_NOMERGE;
-
- blkcg_bio_issue_init(split);
- bio_chain(split, bio);
- trace_block_split(split, bio->bi_iter.bi_sector);
- WARN_ON_ONCE(bio_zone_write_plugging(bio));
- submit_bio_noacct(bio);
- return split;
- }
- return bio;
+ unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim);
+ int split_sectors;
+
+ split_sectors = bio_split_rw_at(bio, lim, nr_segs,
+ max_sectors << SECTOR_SHIFT);
+ if (WARN_ON_ONCE(split_sectors > 0))
+ split_sectors = -EINVAL;
+ return bio_submit_split(bio, split_sectors);
}
/**
@@ -426,9 +414,7 @@ struct bio *bio_split_to_limits(struct bio *bio)
const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- if (bio_may_exceed_limits(bio, lim))
- return __bio_split_to_limits(bio, lim, &nr_segs);
- return bio;
+ return __bio_split_to_limits(bio, lim, &nr_segs);
}
EXPORT_SYMBOL(bio_split_to_limits);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e3c3c0c21b55..3f1f7d0b3ff3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2753,6 +2753,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
struct request *rq;
+ unsigned int depth;
/*
* We may have been called recursively midway through handling
@@ -2763,6 +2764,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
if (plug->rq_count == 0)
return;
+ depth = plug->rq_count;
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
@@ -2770,6 +2772,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
rq = rq_list_peek(&plug->mq_list);
q = rq->q;
+ trace_block_unplug(q, depth, true);
/*
* Peek first request and see if we have a ->queue_rqs() hook.
@@ -2939,7 +2942,7 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = current->plug;
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
- unsigned int nr_segs = 1;
+ unsigned int nr_segs;
struct request *rq;
blk_status_t ret;
@@ -2981,11 +2984,10 @@ void blk_mq_submit_bio(struct bio *bio)
goto queue_exit;
}
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- goto queue_exit;
- }
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto queue_exit;
+
if (!bio_integrity_prep(bio))
goto queue_exit;
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index dd7310c94713..2cfb297d9a62 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -263,7 +263,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
TASK_UNINTERRUPTIBLE);
do {
- /* The memory barrier in set_task_state saves us here. */
+ /* The memory barrier in set_current_state saves us here. */
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6943ec720f39..2c4192e12efa 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1584,6 +1584,22 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
spin_unlock_irq(&q->queue_lock);
}
+static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+ /* throtl is FIFO - if bios are already queued, should queue */
+ if (tg->service_queue.nr_queued[rw])
+ return false;
+
+ return tg_may_dispatch(tg, bio, NULL);
+}
+
+static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw)
+{
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED))
+ tg->carryover_bytes[rw] -= throtl_bio_data_size(bio);
+ tg->carryover_ios[rw]--;
+}
+
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
@@ -1600,34 +1616,35 @@ bool __blk_throtl_bio(struct bio *bio)
sq = &tg->service_queue;
while (true) {
- if (tg->last_low_overflow_time[rw] == 0)
- tg->last_low_overflow_time[rw] = jiffies;
- /* throtl is FIFO - if bios are already queued, should queue */
- if (sq->nr_queued[rw])
- break;
-
- /* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL)) {
- tg->last_low_overflow_time[rw] = jiffies;
+ if (tg_within_limit(tg, bio, rw)) {
+ /* within limits, let's charge and dispatch directly */
+ throtl_charge_bio(tg, bio);
+
+ /*
+ * We need to trim slice even when bios are not being
+ * queued otherwise it might happen that a bio is not
+ * queued for a long time and slice keeps on extending
+ * and trim is not called for a long time. Now if limits
+ * are reduced suddenly we take into account all the IO
+ * dispatched so far at new low rate and * newly queued
+ * IO gets a really long dispatch time.
+ *
+ * So keep on trimming slice even if bio is not queued.
+ */
+ throtl_trim_slice(tg, rw);
+ } else if (bio_issue_as_root_blkg(bio)) {
+ /*
+ * IOs which may cause priority inversions are
+ * dispatched directly, even if they're over limit.
+ * Debts are handled by carryover_bytes/ios while
+ * calculating wait time.
+ */
+ tg_dispatch_in_debt(tg, bio, rw);
+ } else {
+ /* if above limits, break to queue */
break;
}
- /* within limits, let's charge and dispatch directly */
- throtl_charge_bio(tg, bio);
-
- /*
- * We need to trim slice even when bios are not being queued
- * otherwise it might happen that a bio is not queued for
- * a long time and slice keeps on extending and trim is not
- * called for a long time. Now if limits are reduced suddenly
- * we take into account all the IO dispatched so far at new
- * low rate and * newly queued IO gets a really long dispatch
- * time.
- *
- * So keep on trimming slice even if bio is not queued.
- */
- throtl_trim_slice(tg, rw);
-
/*
* @bio passed through this layer without being throttled.
* Climb up the ladder. If we're already at the top, it
@@ -1650,8 +1667,6 @@ bool __blk_throtl_bio(struct bio *bio)
tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
- tg->last_low_overflow_time[rw] = jiffies;
-
td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index 4d9ef5abdf21..1a36d1278eea 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -106,8 +106,6 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
- unsigned long last_low_overflow_time[2];
-
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
diff --git a/block/blk.h b/block/blk.h
index e180863f918b..86affb583eb6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -331,33 +331,67 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-static inline bool bio_may_exceed_limits(struct bio *bio,
- const struct queue_limits *lim)
+struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nsegs);
+struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nsegs);
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nr_segs);
+struct bio *bio_split_zone_append(struct bio *bio,
+ const struct queue_limits *lim, unsigned *nr_segs);
+
+/*
+ * All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
+ *
+ * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is
+ * always valid if a bio has data. The check might lead to occasional false
+ * positives when bios are cloned, but compared to the performance impact of
+ * cloned bios themselves the loop below doesn't matter anyway.
+ */
+static inline bool bio_may_need_split(struct bio *bio,
+ const struct queue_limits *lim)
+{
+ return lim->chunk_sectors || bio->bi_vcnt != 1 ||
+ bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
+}
+
+/**
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ * @lim: queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
+ *
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
+ */
+static inline struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim, unsigned int *nr_segs)
{
switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ if (bio_may_need_split(bio, lim))
+ return bio_split_rw(bio, lim, nr_segs);
+ *nr_segs = 1;
+ return bio;
+ case REQ_OP_ZONE_APPEND:
+ return bio_split_zone_append(bio, lim, nr_segs);
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ return bio_split_discard(bio, lim, nr_segs);
case REQ_OP_WRITE_ZEROES:
- return true; /* non-trivial splitting decisions */
+ return bio_split_write_zeroes(bio, lim, nr_segs);
default:
- break;
+ /* other operations can't be split */
+ *nr_segs = 0;
+ return bio;
}
-
- /*
- * All drivers must accept single-segments bios that are <= PAGE_SIZE.
- * This is a quick and dirty check that relies on the fact that
- * bi_io_vec[0] is always valid if a bio has data. The check might
- * lead to occasional false negatives when bios are cloned, but compared
- * to the performance impact of cloned bios themselves the loop below
- * doesn't matter anyway.
- */
- return lim->chunk_sectors || bio->bi_vcnt != 1 ||
- bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-struct bio *__bio_split_to_limits(struct bio *bio,
- const struct queue_limits *lim,
- unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -540,6 +574,10 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
+int bio_add_hw_folio(struct request_queue *q, struct bio *bio,
+ struct folio *folio, size_t len, size_t offset,
+ unsigned int max_sectors, bool *same_page);
+
/*
* Clean up a page appropriately, where the page may be pinned, may have a
* ref taken on it or neither.
diff --git a/block/ioctl.c b/block/ioctl.c
index e8e4a4190f18..44257bdfeacb 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -126,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
return -EINVAL;
filemap_invalidate_lock(bdev->bd_mapping);
- err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ err = truncate_bdev_range(bdev, mode, start, end - 1);
if (err)
goto fail;
@@ -163,7 +163,7 @@ fail:
static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
void __user *argp)
{
- uint64_t start, len;
+ uint64_t start, len, end;
uint64_t range[2];
int err;
@@ -178,11 +178,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
len = range[1];
if ((start & 511) || (len & 511))
return -EINVAL;
- if (start + len > bdev_nr_bytes(bdev))
+ if (check_add_overflow(start, len, &end) ||
+ end > bdev_nr_bytes(bdev))
return -EINVAL;
filemap_invalidate_lock(bdev->bd_mapping);
- err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ err = truncate_bdev_range(bdev, mode, start, end - 1);
if (!err)
err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
GFP_KERNEL);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index ab76e64f0f6c..5bd7a603092e 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -555,9 +555,11 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
- if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %pe\n",
- disk->disk_name, p, part);
+ if (IS_ERR(part)) {
+ if (PTR_ERR(part) != -ENXIO) {
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
+ }
return true;
}
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 425e2836f3e1..e7052a728966 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -8,7 +8,6 @@
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/crc64.h>
-#include <linux/module.h>
#include <net/checksum.h>
#include <asm/unaligned.h>
#include "blk.h"
@@ -240,9 +239,9 @@ static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
}
}
-static bool ext_pi_ref_escape(u8 *ref_tag)
+static bool ext_pi_ref_escape(const u8 ref_tag[6])
{
- static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+ static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
}
@@ -472,6 +471,3 @@ void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
else
t10_pi_type1_complete(rq, nr_bytes);
}
-
-MODULE_DESCRIPTION("T10 Protection Information module");
-MODULE_LICENSE("GPL");
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 94dc0a235919..2a05d955e30b 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -297,10 +297,6 @@ struct drbd_epoch {
unsigned long flags;
};
-/* Prototype declaration of function defined in drbd_receiver.c */
-int drbdd_init(struct drbd_thread *);
-int drbd_asender(struct drbd_thread *);
-
/* drbd_epoch flag bits */
enum {
DE_HAVE_BARRIER_NUMBER,
@@ -864,7 +860,6 @@ struct drbd_device {
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
- int next_barrier_nr;
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
@@ -1390,9 +1385,6 @@ extern void conn_free_crypto(struct drbd_connection *connection);
extern void do_submit(struct work_struct *ws);
extern void __drbd_make_request(struct drbd_device *, struct bio *);
void drbd_submit_bio(struct bio *bio);
-extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
-extern int is_valid_ar_handle(struct drbd_request *, sector_t);
-
/* drbd_nl.c */
@@ -1474,7 +1466,6 @@ extern int w_resync_timer(struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_work *, int);
extern int w_send_dblock(struct drbd_work *, int);
extern int w_send_read_req(struct drbd_work *, int);
-extern int w_e_reissue(struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_work *, int);
extern int w_send_out_of_sync(struct drbd_work *, int);
@@ -1488,7 +1479,6 @@ extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
sector_t start, unsigned int nr_sectors, int flags);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
-extern void drbd_send_ping_wf(struct work_struct *ws);
extern void drbd_send_acks_wf(struct work_struct *ws);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
@@ -1504,7 +1494,6 @@ extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request
#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
-extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a9e49b212341..449123eb54bf 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1550,7 +1550,7 @@ static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *pa
* put_page(); and would cause either a VM_BUG directly, or
* __page_cache_release a page that would actually still be referenced
* by someone, leading to some obscure delayed Oops somewhere else. */
- if (!drbd_disable_sendpage && sendpage_ok(page))
+ if (!drbd_disable_sendpage && sendpages_ok(page, len, offset))
msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES;
drbd_update_congested(peer_device->connection);
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index e858e7e0383f..c2b6c4d9729d 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -876,7 +876,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns)
ns.disk == D_OUTDATED)
rv = SS_CONNECTED_OUTDATES;
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
(nc->verify_alg[0] == 0))
rv = SS_NO_VERIFY_ALG;
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index c6ef0546ffc9..11901f2812ad 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2269,25 +2269,12 @@ static const struct file_operations mtip_flags_fops = {
.llseek = no_llseek,
};
-static int mtip_hw_debugfs_init(struct driver_data *dd)
+static void mtip_hw_debugfs_init(struct driver_data *dd)
{
- if (!dfs_parent)
- return -1;
-
dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
- if (IS_ERR_OR_NULL(dd->dfs_node)) {
- dev_warn(&dd->pdev->dev,
- "Error creating node %s under debugfs\n",
- dd->disk->disk_name);
- dd->dfs_node = NULL;
- return -1;
- }
-
debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
debugfs_create_file("registers", 0444, dd->dfs_node, dd,
&mtip_regs_fops);
-
- return 0;
}
static void mtip_hw_debugfs_exit(struct driver_data *dd)
@@ -4043,10 +4030,6 @@ static int __init mtip_init(void)
mtip_major = error;
dfs_parent = debugfs_create_dir("rssd", NULL);
- if (IS_ERR_OR_NULL(dfs_parent)) {
- pr_warn("Error creating debugfs parent\n");
- dfs_parent = NULL;
- }
/* Register our PCI operations. */
error = pci_register_driver(&mtip_pci_driver);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 41a90150b501..b852050d8a96 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -181,6 +181,17 @@ static void nbd_requeue_cmd(struct nbd_cmd *cmd)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
+ lockdep_assert_held(&cmd->lock);
+
+ /*
+ * Clear INFLIGHT flag so that this cmd won't be completed in
+ * normal completion path
+ *
+ * INFLIGHT flag will be set when the cmd is queued to nbd next
+ * time.
+ */
+ __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+
if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
blk_mq_requeue_request(req, true);
}
@@ -339,7 +350,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
lim = queue_limits_start_update(nbd->disk->queue);
if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
- lim.max_hw_discard_sectors = UINT_MAX;
+ lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT;
else
lim.max_hw_discard_sectors = 0;
if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) {
@@ -350,6 +361,11 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
lim.features |= BLK_FEAT_WRITE_CACHE;
lim.features &= ~BLK_FEAT_FUA;
}
+ if (nbd->config->flags & NBD_FLAG_ROTATIONAL)
+ lim.features |= BLK_FEAT_ROTATIONAL;
+ if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT;
+
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
@@ -418,6 +434,8 @@ static u32 req_to_nbd_cmd_type(struct request *req)
return NBD_CMD_WRITE;
case REQ_OP_READ:
return NBD_CMD_READ;
+ case REQ_OP_WRITE_ZEROES:
+ return NBD_CMD_WRITE_ZEROES;
default:
return U32_MAX;
}
@@ -488,8 +506,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
nbd_mark_nsock_dead(nbd, nsock, 1);
mutex_unlock(&nsock->tx_lock);
}
- mutex_unlock(&cmd->lock);
nbd_requeue_cmd(cmd);
+ mutex_unlock(&cmd->lock);
nbd_config_put(nbd);
return BLK_EH_DONE;
}
@@ -634,6 +652,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
if (req->cmd_flags & REQ_FUA)
nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
+ if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES))
+ nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE;
/* We did a partial send previously, and we at least sent the whole
* request struct, so just go and send the rest of the pages in the
@@ -1703,6 +1723,10 @@ static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
seq_puts(s, "NBD_FLAG_SEND_FUA\n");
if (flags & NBD_FLAG_SEND_TRIM)
seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+ if (flags & NBD_FLAG_SEND_WRITE_ZEROES)
+ seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n");
+ if (flags & NBD_FLAG_ROTATIONAL)
+ seq_puts(s, "NBD_FLAG_ROTATIONAL\n");
return 0;
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7cece5884b9c..3edb37a41312 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -498,8 +498,6 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
if (!pkt_debugfs_root)
return;
pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root);
- if (!pd->dfs_d_root)
- return;
pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root,
pd, &pkt_seq_fops);
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index f6e3a3c4b76c..08ce6d96d04c 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -149,15 +149,22 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
if (bio_add_page(bio, virt_to_page(data), datalen,
offset_in_page(data)) != datalen) {
- rnbd_srv_err(sess_dev, "Failed to map data to bio\n");
+ rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
err = -EINVAL;
goto bio_put;
}
+ bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
+ if (bio_has_data(bio) &&
+ bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) {
+ rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n",
+ bio->bi_iter.bi_size, msg->bi_size);
+ err = -EINVAL;
+ goto bio_put;
+ }
bio->bi_end_io = rnbd_dev_bi_end_io;
bio->bi_private = priv;
bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
- bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
bio_set_prio(bio, prio);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1d53a3f48a0e..bca06bfb4bc3 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -71,9 +71,6 @@ struct ublk_rq_data {
struct llist_node node;
struct kref ref;
- __u64 sector;
- __u32 operation;
- __u32 nr_zones;
};
struct ublk_uring_cmd_pdu {
@@ -214,6 +211,33 @@ static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
#ifdef CONFIG_BLK_DEV_ZONED
+struct ublk_zoned_report_desc {
+ __u64 sector;
+ __u32 operation;
+ __u32 nr_zones;
+};
+
+static DEFINE_XARRAY(ublk_zoned_report_descs);
+
+static int ublk_zoned_insert_report_desc(const struct request *req,
+ struct ublk_zoned_report_desc *desc)
+{
+ return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
+ desc, GFP_KERNEL);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
+ const struct request *req)
+{
+ return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
+static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
+ const struct request *req)
+{
+ return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
+}
+
static int ublk_get_nr_zones(const struct ublk_device *ub)
{
const struct ublk_param_basic *p = &ub->params.basic;
@@ -308,7 +332,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
unsigned int zones_in_request =
min_t(unsigned int, remaining_zones, max_zones_per_request);
struct request *req;
- struct ublk_rq_data *pdu;
+ struct ublk_zoned_report_desc desc;
blk_status_t status;
memset(buffer, 0, buffer_length);
@@ -319,20 +343,23 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
goto out;
}
- pdu = blk_mq_rq_to_pdu(req);
- pdu->operation = UBLK_IO_OP_REPORT_ZONES;
- pdu->sector = sector;
- pdu->nr_zones = zones_in_request;
+ desc.operation = UBLK_IO_OP_REPORT_ZONES;
+ desc.sector = sector;
+ desc.nr_zones = zones_in_request;
+ ret = ublk_zoned_insert_report_desc(req, &desc);
+ if (ret)
+ goto free_req;
ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
GFP_KERNEL);
- if (ret) {
- blk_mq_free_request(req);
- goto out;
- }
+ if (ret)
+ goto erase_desc;
status = blk_execute_rq(req, 0);
ret = blk_status_to_errno(status);
+erase_desc:
+ ublk_zoned_erase_report_desc(req);
+free_req:
blk_mq_free_request(req);
if (ret)
goto out;
@@ -366,7 +393,7 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
{
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
struct ublk_io *io = &ubq->ios[req->tag];
- struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+ struct ublk_zoned_report_desc *desc;
u32 ublk_op;
switch (req_op(req)) {
@@ -389,12 +416,15 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
break;
case REQ_OP_DRV_IN:
- ublk_op = pdu->operation;
+ desc = ublk_zoned_get_report_desc(req);
+ if (!desc)
+ return BLK_STS_IOERR;
+ ublk_op = desc->operation;
switch (ublk_op) {
case UBLK_IO_OP_REPORT_ZONES:
iod->op_flags = ublk_op | ublk_req_build_flags(req);
- iod->nr_zones = pdu->nr_zones;
- iod->start_sector = pdu->sector;
+ iod->nr_zones = desc->nr_zones;
+ iod->start_sector = desc->sector;
return BLK_STS_OK;
default:
return BLK_STS_IOERR;
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index efcb8d9d274c..84cd62efac0f 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -59,17 +59,17 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index,
static int zram_slot_trylock(struct zram *zram, u32 index)
{
- return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
+ return spin_trylock(&zram->table[index].lock);
}
static void zram_slot_lock(struct zram *zram, u32 index)
{
- bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
+ spin_lock(&zram->table[index].lock);
}
static void zram_slot_unlock(struct zram *zram, u32 index)
{
- bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
+ spin_unlock(&zram->table[index].lock);
}
static inline bool init_done(struct zram *zram)
@@ -1211,7 +1211,7 @@ static void zram_meta_free(struct zram *zram, u64 disksize)
static bool zram_meta_alloc(struct zram *zram, u64 disksize)
{
- size_t num_pages;
+ size_t num_pages, index;
num_pages = disksize >> PAGE_SHIFT;
zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
@@ -1226,6 +1226,9 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
if (!huge_class_size)
huge_class_size = zs_huge_class_size(zram->mem_pool);
+
+ for (index = 0; index < num_pages; index++)
+ spin_lock_init(&zram->table[index].lock);
return true;
}
@@ -1283,7 +1286,7 @@ out:
zram_set_handle(zram, index, 0);
zram_set_obj_size(zram, index, 0);
WARN_ON_ONCE(zram->table[index].flags &
- ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
+ ~(1UL << ZRAM_UNDER_WB));
}
/*
@@ -2401,9 +2404,10 @@ static void destroy_devices(void)
static int __init zram_init(void)
{
+ struct zram_table_entry zram_te;
int ret;
- BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
+ BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8);
ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
zcomp_cpu_up_prepare, zcomp_cpu_dead);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 35e322144629..531cefc66668 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -45,9 +45,7 @@
/* Flags for zram pages (table[page_no].flags) */
enum zram_pageflags {
- /* zram slot is locked */
- ZRAM_LOCK = ZRAM_FLAG_SHIFT,
- ZRAM_SAME, /* Page consists the same element */
+ ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */
ZRAM_WB, /* page is stored on backing_device */
ZRAM_UNDER_WB, /* page is under writeback */
ZRAM_HUGE, /* Incompressible page */
@@ -68,7 +66,8 @@ struct zram_table_entry {
unsigned long handle;
unsigned long element;
};
- unsigned long flags;
+ unsigned int flags;
+ spinlock_t lock;
#ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
ktime_t ac_time;
#endif
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 0c3323e0adb2..63682d27fc8d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3949,7 +3949,9 @@ static int __load_dirty_region_bitmap(struct raid_set *rs)
/* Try loading the bitmap unless "raid0", which does not have one */
if (!rs_is_raid0(rs) &&
!test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) {
- r = md_bitmap_load(&rs->md);
+ struct mddev *mddev = &rs->md;
+
+ r = mddev->bitmap_ops->load(mddev);
if (r)
DMERR("Failed to load bitmap");
}
@@ -4066,7 +4068,8 @@ static int raid_preresume(struct dm_target *ti)
mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) {
int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize;
- r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0);
+ r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors,
+ chunksize, false);
if (r)
DMERR("Failed to resize bitmap");
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index db5330d97348..29da10e6f703 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -32,11 +32,210 @@
#include "md.h"
#include "md-bitmap.h"
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ * Version 5 is currently set only for clustered devices
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_CLUSTERED 5
+#define BITMAP_MAJOR_HOSTENDIAN 3
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed. The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync | counter |
+ * | needed | active | |
+ * | (0-1) | (0-1) | (0-16383) |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ * a '1' bit is read from storage at startup.
+ * a write request fails on some drives
+ * a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ * a resync is started on all drives, and resync_needed is set.
+ * resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared as well, thus setting the counter to 0.
+ * When we set a bit, or in the counter (to start a write), if the fields is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stripe is clean
+ * Anything that dirties the stripe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep find the counter at 2, it is decremented to 1.
+ * If the sweep find the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ * page pointer (32-bit)
+ *
+ * [ ] ------+
+ * |
+ * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
+ * c1 c2 c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ * hijacked page pointer (32-bit)
+ *
+ * [ ][ ] (no page memory allocated)
+ * counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+#define BITMAP_BLOCK_SHIFT 9
+
+/*
+ * bitmap structures:
+ */
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+ /*
+ * map points to the actual memory page
+ */
+ char *map;
+ /*
+ * in emergencies (when map cannot be alloced), hijack the map
+ * pointer and use it as two counters itself
+ */
+ unsigned int hijacked:1;
+ /*
+ * If any counter in this page is '1' or '2' - and so could be
+ * cleared then that page is marked as 'pending'
+ */
+ unsigned int pending:1;
+ /*
+ * count of dirty bits on the page
+ */
+ unsigned int count:30;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+
+ struct bitmap_counts {
+ spinlock_t lock;
+ struct bitmap_page *bp;
+ /* total number of pages in the bitmap */
+ unsigned long pages;
+ /* number of pages not yet allocated */
+ unsigned long missing_pages;
+ /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunkshift;
+ /* total number of data chunks for the array */
+ unsigned long chunks;
+ } counts;
+
+ struct mddev *mddev; /* the md device that the bitmap is for */
+
+ __u64 events_cleared;
+ int need_sync;
+
+ struct bitmap_storage {
+ /* backing disk file */
+ struct file *file;
+ /* cached copy of the bitmap file superblock */
+ struct page *sb_page;
+ unsigned long sb_index;
+ /* list of cache pages for the file */
+ struct page **filemap;
+ /* attributes associated filemap pages */
+ unsigned long *filemap_attr;
+ /* number of pages in the file */
+ unsigned long file_pages;
+ /* total bytes in the bitmap */
+ unsigned long bytes;
+ } storage;
+
+ unsigned long flags;
+
+ int allclean;
+
+ atomic_t behind_writes;
+ /* highest actual value at runtime */
+ unsigned long behind_writes_used;
+
+ /*
+ * the bitmap daemon - periodically wakes up and sweeps the bitmap
+ * file, cleaning up bits and flushing out pages to disk as necessary
+ */
+ unsigned long daemon_lastrun; /* jiffies of last run */
+ /*
+ * when we lasted called end_sync to update bitmap with resync
+ * progress.
+ */
+ unsigned long last_end_sync;
+
+ /* pending writes to the bitmap file */
+ atomic_t pending_writes;
+ wait_queue_head_t write_wait;
+ wait_queue_head_t overflow_wait;
+ wait_queue_head_t behind_wait;
+
+ struct kernfs_node *sysfs_can_clear;
+ /* slot offset for clustered env */
+ int cluster_slot;
+};
+
+static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, bool init);
+
static inline char *bmname(struct bitmap *bitmap)
{
return bitmap->mddev ? mdname(bitmap->mddev) : "mdX";
}
+static bool __bitmap_enabled(struct bitmap *bitmap)
+{
+ return bitmap->storage.filemap &&
+ !test_bit(BITMAP_STALE, &bitmap->flags);
+}
+
+static bool bitmap_enabled(struct mddev *mddev)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return false;
+
+ return __bitmap_enabled(bitmap);
+}
+
/*
* check a page and, if necessary, allocate it (or hijack it if the alloc fails)
*
@@ -472,9 +671,10 @@ static void md_bitmap_wait_writes(struct bitmap *bitmap)
/* update the event counter and sync the superblock to disk */
-void md_bitmap_update_sb(struct bitmap *bitmap)
+static void bitmap_update_sb(void *data)
{
bitmap_super_t *sb;
+ struct bitmap *bitmap = data;
if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
return;
@@ -510,10 +710,8 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
write_sb_page(bitmap, bitmap->storage.sb_index,
bitmap->storage.sb_page, 1);
}
-EXPORT_SYMBOL(md_bitmap_update_sb);
-/* print out the bitmap file superblock */
-void md_bitmap_print_sb(struct bitmap *bitmap)
+static void bitmap_print_sb(struct bitmap *bitmap)
{
bitmap_super_t *sb;
@@ -760,7 +958,7 @@ out_no_sb:
bitmap->mddev->bitmap_info.space > sectors_reserved)
bitmap->mddev->bitmap_info.space = sectors_reserved;
} else {
- md_bitmap_print_sb(bitmap);
+ bitmap_print_sb(bitmap);
if (bitmap->cluster_slot < 0)
md_cluster_stop(bitmap->mddev);
}
@@ -893,7 +1091,7 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
if (bitmap->storage.file) {
pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
@@ -1028,13 +1226,13 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
/* this gets called when the md device is ready to unplug its underlying
* (slave) device queues -- before we let any writes go down, we need to
* sync the dirty pages of the bitmap file to disk */
-void md_bitmap_unplug(struct bitmap *bitmap)
+static void __bitmap_unplug(struct bitmap *bitmap)
{
unsigned long i;
int dirty, need_write;
int writing = 0;
- if (!md_bitmap_enabled(bitmap))
+ if (!__bitmap_enabled(bitmap))
return;
/* look at each page to see if there are any set bits that need to be
@@ -1060,7 +1258,6 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
md_bitmap_file_kick(bitmap);
}
-EXPORT_SYMBOL(md_bitmap_unplug);
struct bitmap_unplug_work {
struct work_struct work;
@@ -1073,11 +1270,11 @@ static void md_bitmap_unplug_fn(struct work_struct *work)
struct bitmap_unplug_work *unplug_work =
container_of(work, struct bitmap_unplug_work, work);
- md_bitmap_unplug(unplug_work->bitmap);
+ __bitmap_unplug(unplug_work->bitmap);
complete(unplug_work->done);
}
-void md_bitmap_unplug_async(struct bitmap *bitmap)
+static void bitmap_unplug_async(struct bitmap *bitmap)
{
DECLARE_COMPLETION_ONSTACK(done);
struct bitmap_unplug_work unplug_work;
@@ -1089,7 +1286,19 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
queue_work(md_bitmap_wq, &unplug_work.work);
wait_for_completion(&done);
}
-EXPORT_SYMBOL(md_bitmap_unplug_async);
+
+static void bitmap_unplug(struct mddev *mddev, bool sync)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
+
+ if (sync)
+ __bitmap_unplug(bitmap);
+ else
+ bitmap_unplug_async(bitmap);
+}
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
@@ -1226,22 +1435,21 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return ret;
}
-void md_bitmap_write_all(struct bitmap *bitmap)
+/* just flag bitmap pages as needing to be written. */
+static void bitmap_write_all(struct mddev *mddev)
{
- /* We don't actually write all bitmap blocks here,
- * just flag them as needing to be written
- */
int i;
+ struct bitmap *bitmap = mddev->bitmap;
if (!bitmap || !bitmap->storage.filemap)
return;
+
+ /* Only one copy, so nothing needed */
if (bitmap->storage.file)
- /* Only one copy, so nothing needed */
return;
for (i = 0; i < bitmap->storage.file_pages; i++)
- set_page_attr(bitmap, i,
- BITMAP_PAGE_NEEDWRITE);
+ set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
bitmap->allclean = 0;
}
@@ -1290,7 +1498,7 @@ out:
* bitmap daemon -- periodically wakes up to clean bits and flush pages
* out to disk
*/
-void md_bitmap_daemon_work(struct mddev *mddev)
+static void bitmap_daemon_work(struct mddev *mddev)
{
struct bitmap *bitmap;
unsigned long j;
@@ -1461,8 +1669,11 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
-int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
+static int bitmap_startwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors, bool behind)
{
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return 0;
@@ -1523,13 +1734,15 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
}
return 0;
}
-EXPORT_SYMBOL(md_bitmap_startwrite);
-void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int success, int behind)
+static void bitmap_endwrite(struct mddev *mddev, sector_t offset,
+ unsigned long sectors, bool success, bool behind)
{
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return;
+
if (behind) {
if (atomic_dec_and_test(&bitmap->behind_writes))
wake_up(&bitmap->behind_wait);
@@ -1576,26 +1789,27 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
sectors = 0;
}
}
-EXPORT_SYMBOL(md_bitmap_endwrite);
-static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
- int degraded)
+static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset,
+ sector_t *blocks, bool degraded)
{
bitmap_counter_t *bmc;
- int rv;
+ bool rv;
+
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024;
- return 1; /* always resync if no bitmap */
+ return true; /* always resync if no bitmap */
}
spin_lock_irq(&bitmap->counts.lock);
+
+ rv = false;
bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
- rv = 0;
if (bmc) {
/* locked */
- if (RESYNC(*bmc))
- rv = 1;
- else if (NEEDED(*bmc)) {
- rv = 1;
+ if (RESYNC(*bmc)) {
+ rv = true;
+ } else if (NEEDED(*bmc)) {
+ rv = true;
if (!degraded) { /* don't set/clear bits if degraded */
*bmc |= RESYNC_MASK;
*bmc &= ~NEEDED_MASK;
@@ -1603,11 +1817,12 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
}
}
spin_unlock_irq(&bitmap->counts.lock);
+
return rv;
}
-int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks,
- int degraded)
+static bool bitmap_start_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded)
{
/* bitmap_start_sync must always report on multiples of whole
* pages, otherwise resync (which is very PAGE_SIZE based) will
@@ -1616,21 +1831,22 @@ int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *block
* At least PAGE_SIZE>>9 blocks are covered.
* Return the 'or' of the result.
*/
- int rv = 0;
+ bool rv = false;
sector_t blocks1;
*blocks = 0;
while (*blocks < (PAGE_SIZE>>9)) {
- rv |= __bitmap_start_sync(bitmap, offset,
+ rv |= __bitmap_start_sync(mddev->bitmap, offset,
&blocks1, degraded);
offset += blocks1;
*blocks += blocks1;
}
+
return rv;
}
-EXPORT_SYMBOL(md_bitmap_start_sync);
-void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted)
+static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset,
+ sector_t *blocks, bool aborted)
{
bitmap_counter_t *bmc;
unsigned long flags;
@@ -1659,9 +1875,14 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
unlock:
spin_unlock_irqrestore(&bitmap->counts.lock, flags);
}
-EXPORT_SYMBOL(md_bitmap_end_sync);
-void md_bitmap_close_sync(struct bitmap *bitmap)
+static void bitmap_end_sync(struct mddev *mddev, sector_t offset,
+ sector_t *blocks)
+{
+ __bitmap_end_sync(mddev->bitmap, offset, blocks, true);
+}
+
+static void bitmap_close_sync(struct mddev *mddev)
{
/* Sync has finished, and any bitmap chunks that weren't synced
* properly have been aborted. It remains to us to clear the
@@ -1669,19 +1890,23 @@ void md_bitmap_close_sync(struct bitmap *bitmap)
*/
sector_t sector = 0;
sector_t blocks;
+ struct bitmap *bitmap = mddev->bitmap;
+
if (!bitmap)
return;
+
while (sector < bitmap->mddev->resync_max_sectors) {
- md_bitmap_end_sync(bitmap, sector, &blocks, 0);
+ __bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
}
-EXPORT_SYMBOL(md_bitmap_close_sync);
-void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
+static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector,
+ bool force)
{
sector_t s = 0;
sector_t blocks;
+ struct bitmap *bitmap = mddev->bitmap;
if (!bitmap)
return;
@@ -1700,34 +1925,32 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
s = 0;
while (s < sector && s < bitmap->mddev->resync_max_sectors) {
- md_bitmap_end_sync(bitmap, s, &blocks, 0);
+ __bitmap_end_sync(bitmap, s, &blocks, false);
s += blocks;
}
bitmap->last_end_sync = jiffies;
sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed);
}
-EXPORT_SYMBOL(md_bitmap_cond_end_sync);
-void md_bitmap_sync_with_cluster(struct mddev *mddev,
- sector_t old_lo, sector_t old_hi,
- sector_t new_lo, sector_t new_hi)
+static void bitmap_sync_with_cluster(struct mddev *mddev,
+ sector_t old_lo, sector_t old_hi,
+ sector_t new_lo, sector_t new_hi)
{
struct bitmap *bitmap = mddev->bitmap;
sector_t sector, blocks = 0;
for (sector = old_lo; sector < new_lo; ) {
- md_bitmap_end_sync(bitmap, sector, &blocks, 0);
+ __bitmap_end_sync(bitmap, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n");
for (sector = old_hi; sector < new_hi; ) {
- md_bitmap_start_sync(bitmap, sector, &blocks, 0);
+ bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n");
}
-EXPORT_SYMBOL(md_bitmap_sync_with_cluster);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed)
{
@@ -1756,12 +1979,18 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
-void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
+static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s,
+ unsigned long e)
{
unsigned long chunk;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return;
for (chunk = s; chunk <= e; chunk++) {
sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift;
+
md_bitmap_set_memory_bits(bitmap, sec, 1);
md_bitmap_file_set_bit(bitmap, sec);
if (sec < bitmap->mddev->recovery_cp)
@@ -1773,10 +2002,7 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long
}
}
-/*
- * flush out any pending updates
- */
-void md_bitmap_flush(struct mddev *mddev)
+static void bitmap_flush(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
long sleep;
@@ -1789,23 +2015,21 @@ void md_bitmap_flush(struct mddev *mddev)
*/
sleep = mddev->bitmap_info.daemon_sleep * 2;
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
bitmap->daemon_lastrun -= sleep;
- md_bitmap_daemon_work(mddev);
+ bitmap_daemon_work(mddev);
if (mddev->bitmap_info.external)
md_super_wait(mddev);
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
}
-/*
- * free memory that was allocated
- */
-void md_bitmap_free(struct bitmap *bitmap)
+static void md_bitmap_free(void *data)
{
unsigned long k, pages;
struct bitmap_page *bp;
+ struct bitmap *bitmap = data;
if (!bitmap) /* there was no bitmap */
return;
@@ -1836,9 +2060,8 @@ void md_bitmap_free(struct bitmap *bitmap)
kfree(bp);
kfree(bitmap);
}
-EXPORT_SYMBOL(md_bitmap_free);
-void md_bitmap_wait_behind_writes(struct mddev *mddev)
+static void bitmap_wait_behind_writes(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
@@ -1852,14 +2075,14 @@ void md_bitmap_wait_behind_writes(struct mddev *mddev)
}
}
-void md_bitmap_destroy(struct mddev *mddev)
+static void bitmap_destroy(struct mddev *mddev)
{
struct bitmap *bitmap = mddev->bitmap;
if (!bitmap) /* there was no bitmap */
return;
- md_bitmap_wait_behind_writes(mddev);
+ bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
mddev_destroy_serial_pool(mddev, NULL);
@@ -1878,7 +2101,7 @@ void md_bitmap_destroy(struct mddev *mddev)
* if this returns an error, bitmap_destroy must be called to do clean up
* once mddev->bitmap is set
*/
-struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
+static struct bitmap *__bitmap_create(struct mddev *mddev, int slot)
{
struct bitmap *bitmap;
sector_t blocks = mddev->resync_max_sectors;
@@ -1948,7 +2171,8 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
goto error;
bitmap->daemon_lastrun = jiffies;
- err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
+ err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize,
+ true);
if (err)
goto error;
@@ -1965,7 +2189,18 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
return ERR_PTR(err);
}
-int md_bitmap_load(struct mddev *mddev)
+static int bitmap_create(struct mddev *mddev, int slot)
+{
+ struct bitmap *bitmap = __bitmap_create(mddev, slot);
+
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ mddev->bitmap = bitmap;
+ return 0;
+}
+
+static int bitmap_load(struct mddev *mddev)
{
int err = 0;
sector_t start = 0;
@@ -1989,10 +2224,10 @@ int md_bitmap_load(struct mddev *mddev)
*/
while (sector < mddev->resync_max_sectors) {
sector_t blocks;
- md_bitmap_start_sync(bitmap, sector, &blocks, 0);
+ bitmap_start_sync(mddev, sector, &blocks, false);
sector += blocks;
}
- md_bitmap_close_sync(bitmap);
+ bitmap_close_sync(mddev);
if (mddev->degraded == 0
|| bitmap->events_cleared == mddev->events)
@@ -2014,22 +2249,21 @@ int md_bitmap_load(struct mddev *mddev)
mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true);
md_wakeup_thread(mddev->thread);
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
err = -EIO;
out:
return err;
}
-EXPORT_SYMBOL_GPL(md_bitmap_load);
/* caller need to free returned bitmap with md_bitmap_free() */
-struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
+static void *bitmap_get_from_slot(struct mddev *mddev, int slot)
{
int rv = 0;
struct bitmap *bitmap;
- bitmap = md_bitmap_create(mddev, slot);
+ bitmap = __bitmap_create(mddev, slot);
if (IS_ERR(bitmap)) {
rv = PTR_ERR(bitmap);
return ERR_PTR(rv);
@@ -2043,20 +2277,19 @@ struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot)
return bitmap;
}
-EXPORT_SYMBOL(get_bitmap_from_slot);
/* Loads the bitmap associated with slot and copies the resync information
* to our bitmap
*/
-int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
- sector_t *low, sector_t *high, bool clear_bits)
+static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low,
+ sector_t *high, bool clear_bits)
{
int rv = 0, i, j;
sector_t block, lo = 0, hi = 0;
struct bitmap_counts *counts;
struct bitmap *bitmap;
- bitmap = get_bitmap_from_slot(mddev, slot);
+ bitmap = bitmap_get_from_slot(mddev, slot);
if (IS_ERR(bitmap)) {
pr_err("%s can't get bitmap from slot %d\n", __func__, slot);
return -1;
@@ -2076,53 +2309,59 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
}
if (clear_bits) {
- md_bitmap_update_sb(bitmap);
+ bitmap_update_sb(bitmap);
/* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs
* BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */
for (i = 0; i < bitmap->storage.file_pages; i++)
if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING))
set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE);
- md_bitmap_unplug(bitmap);
+ __bitmap_unplug(bitmap);
}
- md_bitmap_unplug(mddev->bitmap);
+ __bitmap_unplug(mddev->bitmap);
*low = lo;
*high = hi;
md_bitmap_free(bitmap);
return rv;
}
-EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot);
+static void bitmap_set_pages(void *data, unsigned long pages)
+{
+ struct bitmap *bitmap = data;
+
+ bitmap->counts.pages = pages;
+}
-void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
+static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats)
{
- unsigned long chunk_kb;
+ struct bitmap_storage *storage;
struct bitmap_counts *counts;
+ struct bitmap *bitmap = data;
+ bitmap_super_t *sb;
if (!bitmap)
- return;
+ return -ENOENT;
+
+ sb = kmap_local_page(bitmap->storage.sb_page);
+ stats->sync_size = le64_to_cpu(sb->sync_size);
+ kunmap_local(sb);
counts = &bitmap->counts;
+ stats->missing_pages = counts->missing_pages;
+ stats->pages = counts->pages;
- chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
- seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
- "%lu%s chunk",
- counts->pages - counts->missing_pages,
- counts->pages,
- (counts->pages - counts->missing_pages)
- << (PAGE_SHIFT - 10),
- chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
- chunk_kb ? "KB" : "B");
- if (bitmap->storage.file) {
- seq_printf(seq, ", file: ");
- seq_file_path(seq, bitmap->storage.file, " \t\n");
- }
+ storage = &bitmap->storage;
+ stats->file_pages = storage->file_pages;
+ stats->file = storage->file;
- seq_printf(seq, "\n");
+ stats->behind_writes = atomic_read(&bitmap->behind_writes);
+ stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait);
+ stats->events_cleared = bitmap->events_cleared;
+ return 0;
}
-int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
- int chunksize, int init)
+static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, bool init)
{
/* If chunk_size is 0, choose an appropriate chunk size.
* Then possibly allocate new storage space.
@@ -2320,14 +2559,24 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
spin_unlock_irq(&bitmap->counts.lock);
if (!init) {
- md_bitmap_unplug(bitmap);
+ __bitmap_unplug(bitmap);
bitmap->mddev->pers->quiesce(bitmap->mddev, 0);
}
ret = 0;
err:
return ret;
}
-EXPORT_SYMBOL_GPL(md_bitmap_resize);
+
+static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize,
+ bool init)
+{
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
+ return 0;
+
+ return __bitmap_resize(bitmap, blocks, chunksize, init);
+}
static ssize_t
location_show(struct mddev *mddev, char *page)
@@ -2367,7 +2616,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
goto out;
}
- md_bitmap_destroy(mddev);
+ bitmap_destroy(mddev);
mddev->bitmap_info.offset = 0;
if (mddev->bitmap_info.file) {
struct file *f = mddev->bitmap_info.file;
@@ -2377,7 +2626,6 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
} else {
/* No bitmap, OK to set a location */
long long offset;
- struct bitmap *bitmap;
if (strncmp(buf, "none", 4) == 0)
/* nothing to be done */;
@@ -2404,17 +2652,14 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
mddev->bitmap_info.offset = offset;
- bitmap = md_bitmap_create(mddev, -1);
- if (IS_ERR(bitmap)) {
- rv = PTR_ERR(bitmap);
+ rv = bitmap_create(mddev, -1);
+ if (rv)
goto out;
- }
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
+ rv = bitmap_load(mddev);
if (rv) {
mddev->bitmap_info.offset = 0;
- md_bitmap_destroy(mddev);
+ bitmap_destroy(mddev);
goto out;
}
}
@@ -2450,6 +2695,7 @@ space_show(struct mddev *mddev, char *page)
static ssize_t
space_store(struct mddev *mddev, const char *buf, size_t len)
{
+ struct bitmap *bitmap;
unsigned long sectors;
int rv;
@@ -2460,8 +2706,8 @@ space_store(struct mddev *mddev, const char *buf, size_t len)
if (sectors == 0)
return -EINVAL;
- if (mddev->bitmap &&
- sectors < (mddev->bitmap->storage.bytes + 511) >> 9)
+ bitmap = mddev->bitmap;
+ if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9)
return -EFBIG; /* Bitmap is too big for this small space */
/* could make sure it isn't too big, but that isn't really
@@ -2569,7 +2815,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
- md_bitmap_update_sb(mddev->bitmap);
+ bitmap_update_sb(mddev->bitmap);
mddev_unlock_and_resume(mddev);
return len;
@@ -2638,10 +2884,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
static ssize_t can_clear_show(struct mddev *mddev, char *page)
{
int len;
+ struct bitmap *bitmap;
+
spin_lock(&mddev->lock);
- if (mddev->bitmap)
- len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ?
- "false" : "true"));
+ bitmap = mddev->bitmap;
+ if (bitmap)
+ len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" :
+ "true"));
else
len = sprintf(page, "\n");
spin_unlock(&mddev->lock);
@@ -2650,17 +2899,24 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
{
- if (mddev->bitmap == NULL)
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (!bitmap)
return -ENOENT;
- if (strncmp(buf, "false", 5) == 0)
- mddev->bitmap->need_sync = 1;
- else if (strncmp(buf, "true", 4) == 0) {
+
+ if (strncmp(buf, "false", 5) == 0) {
+ bitmap->need_sync = 1;
+ return len;
+ }
+
+ if (strncmp(buf, "true", 4) == 0) {
if (mddev->degraded)
return -EBUSY;
- mddev->bitmap->need_sync = 0;
- } else
- return -EINVAL;
- return len;
+ bitmap->need_sync = 0;
+ return len;
+ }
+
+ return -EINVAL;
}
static struct md_sysfs_entry bitmap_can_clear =
@@ -2670,21 +2926,26 @@ static ssize_t
behind_writes_used_show(struct mddev *mddev, char *page)
{
ssize_t ret;
+ struct bitmap *bitmap;
+
spin_lock(&mddev->lock);
- if (mddev->bitmap == NULL)
+ bitmap = mddev->bitmap;
+ if (!bitmap)
ret = sprintf(page, "0\n");
else
- ret = sprintf(page, "%lu\n",
- mddev->bitmap->behind_writes_used);
+ ret = sprintf(page, "%lu\n", bitmap->behind_writes_used);
spin_unlock(&mddev->lock);
+
return ret;
}
static ssize_t
behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len)
{
- if (mddev->bitmap)
- mddev->bitmap->behind_writes_used = 0;
+ struct bitmap *bitmap = mddev->bitmap;
+
+ if (bitmap)
+ bitmap->behind_writes_used = 0;
return len;
}
@@ -2707,3 +2968,38 @@ const struct attribute_group md_bitmap_group = {
.name = "bitmap",
.attrs = md_bitmap_attrs,
};
+
+static struct bitmap_operations bitmap_ops = {
+ .enabled = bitmap_enabled,
+ .create = bitmap_create,
+ .resize = bitmap_resize,
+ .load = bitmap_load,
+ .destroy = bitmap_destroy,
+ .flush = bitmap_flush,
+ .write_all = bitmap_write_all,
+ .dirty_bits = bitmap_dirty_bits,
+ .unplug = bitmap_unplug,
+ .daemon_work = bitmap_daemon_work,
+ .wait_behind_writes = bitmap_wait_behind_writes,
+
+ .startwrite = bitmap_startwrite,
+ .endwrite = bitmap_endwrite,
+ .start_sync = bitmap_start_sync,
+ .end_sync = bitmap_end_sync,
+ .cond_end_sync = bitmap_cond_end_sync,
+ .close_sync = bitmap_close_sync,
+
+ .update_sb = bitmap_update_sb,
+ .get_stats = bitmap_get_stats,
+
+ .sync_with_cluster = bitmap_sync_with_cluster,
+ .get_from_slot = bitmap_get_from_slot,
+ .copy_from_slot = bitmap_copy_from_slot,
+ .set_pages = bitmap_set_pages,
+ .free = md_bitmap_free,
+};
+
+void mddev_set_bitmap_ops(struct mddev *mddev)
+{
+ mddev->bitmap_ops = &bitmap_ops;
+}
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index bb9eb418780a..662e6fc141a7 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -7,81 +7,7 @@
#ifndef BITMAP_H
#define BITMAP_H 1
-#define BITMAP_MAJOR_LO 3
-/* version 4 insists the bitmap is in little-endian order
- * with version 3, it is host-endian which is non-portable
- * Version 5 is currently set only for clustered devices
- */
-#define BITMAP_MAJOR_HI 4
-#define BITMAP_MAJOR_CLUSTERED 5
-#define BITMAP_MAJOR_HOSTENDIAN 3
-
-/*
- * in-memory bitmap:
- *
- * Use 16 bit block counters to track pending writes to each "chunk".
- * The 2 high order bits are special-purpose, the first is a flag indicating
- * whether a resync is needed. The second is a flag indicating whether a
- * resync is active.
- * This means that the counter is actually 14 bits:
- *
- * +--------+--------+------------------------------------------------+
- * | resync | resync | counter |
- * | needed | active | |
- * | (0-1) | (0-1) | (0-16383) |
- * +--------+--------+------------------------------------------------+
- *
- * The "resync needed" bit is set when:
- * a '1' bit is read from storage at startup.
- * a write request fails on some drives
- * a resync is aborted on a chunk with 'resync active' set
- * It is cleared (and resync-active set) when a resync starts across all drives
- * of the chunk.
- *
- *
- * The "resync active" bit is set when:
- * a resync is started on all drives, and resync_needed is set.
- * resync_needed will be cleared (as long as resync_active wasn't already set).
- * It is cleared when a resync completes.
- *
- * The counter counts pending write requests, plus the on-disk bit.
- * When the counter is '1' and the resync bits are clear, the on-disk
- * bit can be cleared as well, thus setting the counter to 0.
- * When we set a bit, or in the counter (to start a write), if the fields is
- * 0, we first set the disk bit and set the counter to 1.
- *
- * If the counter is 0, the on-disk bit is clear and the stripe is clean
- * Anything that dirties the stripe pushes the counter to 2 (at least)
- * and sets the on-disk bit (lazily).
- * If a periodic sweep find the counter at 2, it is decremented to 1.
- * If the sweep find the counter at 1, the on-disk bit is cleared and the
- * counter goes to zero.
- *
- * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
- * counters as a fallback when "page" memory cannot be allocated:
- *
- * Normal case (page memory allocated):
- *
- * page pointer (32-bit)
- *
- * [ ] ------+
- * |
- * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
- * c1 c2 c2048
- *
- * Hijacked case (page memory allocation failed):
- *
- * hijacked page pointer (32-bit)
- *
- * [ ][ ] (no page memory allocated)
- * counter #1 (16-bit) counter #2 (16-bit)
- *
- */
-
-#ifdef __KERNEL__
-
-#define PAGE_BITS (PAGE_SIZE << 3)
-#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+#define BITMAP_MAGIC 0x6d746962
typedef __u16 bitmap_counter_t;
#define COUNTER_BITS 16
@@ -91,26 +17,6 @@ typedef __u16 bitmap_counter_t;
#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
-#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
-#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
-#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
-
-/* how many counters per page? */
-#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
-/* same, except a shift value for more efficient bitops */
-#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
-/* same, except a mask value for more efficient bitops */
-#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
-
-#define BITMAP_BLOCK_SHIFT 9
-
-#endif
-
-/*
- * bitmap structures:
- */
-
-#define BITMAP_MAGIC 0x6d746962
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
@@ -152,136 +58,58 @@ typedef struct bitmap_super_s {
* devices. For raid10 it is the size of the array.
*/
-#ifdef __KERNEL__
+struct md_bitmap_stats {
+ u64 events_cleared;
+ int behind_writes;
+ bool behind_wait;
-/* the in-memory bitmap is represented by bitmap_pages */
-struct bitmap_page {
- /*
- * map points to the actual memory page
- */
- char *map;
- /*
- * in emergencies (when map cannot be alloced), hijack the map
- * pointer and use it as two counters itself
- */
- unsigned int hijacked:1;
- /*
- * If any counter in this page is '1' or '2' - and so could be
- * cleared then that page is marked as 'pending'
- */
- unsigned int pending:1;
- /*
- * count of dirty bits on the page
- */
- unsigned int count:30;
+ unsigned long missing_pages;
+ unsigned long file_pages;
+ unsigned long sync_size;
+ unsigned long pages;
+ struct file *file;
};
-/* the main bitmap structure - one per mddev */
-struct bitmap {
-
- struct bitmap_counts {
- spinlock_t lock;
- struct bitmap_page *bp;
- unsigned long pages; /* total number of pages
- * in the bitmap */
- unsigned long missing_pages; /* number of pages
- * not yet allocated */
- unsigned long chunkshift; /* chunksize = 2^chunkshift
- * (for bitops) */
- unsigned long chunks; /* Total number of data
- * chunks for the array */
- } counts;
-
- struct mddev *mddev; /* the md device that the bitmap is for */
-
- __u64 events_cleared;
- int need_sync;
-
- struct bitmap_storage {
- struct file *file; /* backing disk file */
- struct page *sb_page; /* cached copy of the bitmap
- * file superblock */
- unsigned long sb_index;
- struct page **filemap; /* list of cache pages for
- * the file */
- unsigned long *filemap_attr; /* attributes associated
- * w/ filemap pages */
- unsigned long file_pages; /* number of pages in the file*/
- unsigned long bytes; /* total bytes in the bitmap */
- } storage;
-
- unsigned long flags;
-
- int allclean;
-
- atomic_t behind_writes;
- unsigned long behind_writes_used; /* highest actual value at runtime */
-
- /*
- * the bitmap daemon - periodically wakes up and sweeps the bitmap
- * file, cleaning up bits and flushing out pages to disk as necessary
- */
- unsigned long daemon_lastrun; /* jiffies of last run */
- unsigned long last_end_sync; /* when we lasted called end_sync to
- * update bitmap with resync progress */
-
- atomic_t pending_writes; /* pending writes to the bitmap file */
- wait_queue_head_t write_wait;
- wait_queue_head_t overflow_wait;
- wait_queue_head_t behind_wait;
-
- struct kernfs_node *sysfs_can_clear;
- int cluster_slot; /* Slot offset for clustered env */
+struct bitmap_operations {
+ bool (*enabled)(struct mddev *mddev);
+ int (*create)(struct mddev *mddev, int slot);
+ int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize,
+ bool init);
+
+ int (*load)(struct mddev *mddev);
+ void (*destroy)(struct mddev *mddev);
+ void (*flush)(struct mddev *mddev);
+ void (*write_all)(struct mddev *mddev);
+ void (*dirty_bits)(struct mddev *mddev, unsigned long s,
+ unsigned long e);
+ void (*unplug)(struct mddev *mddev, bool sync);
+ void (*daemon_work)(struct mddev *mddev);
+ void (*wait_behind_writes)(struct mddev *mddev);
+
+ int (*startwrite)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors, bool behind);
+ void (*endwrite)(struct mddev *mddev, sector_t offset,
+ unsigned long sectors, bool success, bool behind);
+ bool (*start_sync)(struct mddev *mddev, sector_t offset,
+ sector_t *blocks, bool degraded);
+ void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks);
+ void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force);
+ void (*close_sync)(struct mddev *mddev);
+
+ void (*update_sb)(void *data);
+ int (*get_stats)(void *data, struct md_bitmap_stats *stats);
+
+ void (*sync_with_cluster)(struct mddev *mddev,
+ sector_t old_lo, sector_t old_hi,
+ sector_t new_lo, sector_t new_hi);
+ void *(*get_from_slot)(struct mddev *mddev, int slot);
+ int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo,
+ sector_t *hi, bool clear_bits);
+ void (*set_pages)(void *data, unsigned long pages);
+ void (*free)(void *data);
};
/* the bitmap API */
-
-/* these are used only by md/bitmap */
-struct bitmap *md_bitmap_create(struct mddev *mddev, int slot);
-int md_bitmap_load(struct mddev *mddev);
-void md_bitmap_flush(struct mddev *mddev);
-void md_bitmap_destroy(struct mddev *mddev);
-
-void md_bitmap_print_sb(struct bitmap *bitmap);
-void md_bitmap_update_sb(struct bitmap *bitmap);
-void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
-
-int md_bitmap_setallbits(struct bitmap *bitmap);
-void md_bitmap_write_all(struct bitmap *bitmap);
-
-void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e);
-
-/* these are exported */
-int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int behind);
-void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
- unsigned long sectors, int success, int behind);
-int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
-void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
-void md_bitmap_close_sync(struct bitmap *bitmap);
-void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
-void md_bitmap_sync_with_cluster(struct mddev *mddev,
- sector_t old_lo, sector_t old_hi,
- sector_t new_lo, sector_t new_hi);
-
-void md_bitmap_unplug(struct bitmap *bitmap);
-void md_bitmap_unplug_async(struct bitmap *bitmap);
-void md_bitmap_daemon_work(struct mddev *mddev);
-
-int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
- int chunksize, int init);
-struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot);
-int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
- sector_t *lo, sector_t *hi, bool clear_bits);
-void md_bitmap_free(struct bitmap *bitmap);
-void md_bitmap_wait_behind_writes(struct mddev *mddev);
-
-static inline bool md_bitmap_enabled(struct bitmap *bitmap)
-{
- return bitmap && bitmap->storage.filemap &&
- !test_bit(BITMAP_STALE, &bitmap->flags);
-}
-
-#endif
+void mddev_set_bitmap_ops(struct mddev *mddev);
#endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1d0db62f0351..6595f89becdb 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -317,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread)
str, ret);
goto clear_bit;
}
- ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
+ ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true);
if (ret) {
pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
goto clear_bit;
@@ -497,8 +497,8 @@ static void process_suspend_info(struct mddev *mddev,
* we don't want to trigger lots of WARN.
*/
if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
- md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
- cinfo->sync_hi, lo, hi);
+ mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low,
+ cinfo->sync_hi, lo, hi);
cinfo->sync_low = lo;
cinfo->sync_hi = hi;
@@ -628,8 +628,9 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
break;
case BITMAP_RESIZE:
if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
- ret = md_bitmap_resize(mddev->bitmap,
- le64_to_cpu(msg->high), 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev,
+ le64_to_cpu(msg->high),
+ 0, false);
break;
default:
ret = -1;
@@ -856,7 +857,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
}
/* Read the disk bitmap sb and check if it needs recovery */
- ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+ ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false);
if (ret) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
lockres_free(bm_lockres);
@@ -1143,13 +1144,16 @@ static int update_bitmap_size(struct mddev *mddev, sector_t size)
static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
{
- struct bitmap_counts *counts;
- char str[64];
- struct dlm_lock_resource *bm_lockres;
- struct bitmap *bitmap = mddev->bitmap;
- unsigned long my_pages = bitmap->counts.pages;
+ void *bitmap = mddev->bitmap;
+ struct md_bitmap_stats stats;
+ unsigned long my_pages;
int i, rv;
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ return rv;
+
+ my_pages = stats.pages;
/*
* We need to ensure all the nodes can grow to a larger
* bitmap size before make the reshaping.
@@ -1159,17 +1163,22 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
return rv;
for (i = 0; i < mddev->bitmap_info.nodes; i++) {
+ struct dlm_lock_resource *bm_lockres;
+ char str[64];
+
if (i == md_cluster_ops->slot_number(mddev))
continue;
- bitmap = get_bitmap_from_slot(mddev, i);
+ bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
bitmap = NULL;
goto out;
}
- counts = &bitmap->counts;
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ goto out;
/*
* If we can hold the bitmap lock of one node then
* the slot is not occupied, update the pages.
@@ -1183,21 +1192,21 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
- counts->pages = my_pages;
+ mddev->bitmap_ops->set_pages(bitmap, my_pages);
lockres_free(bm_lockres);
- if (my_pages != counts->pages)
+ if (my_pages != stats.pages)
/*
* Let's revert the bitmap size if one node
* can't resize bitmap
*/
goto out;
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
}
return 0;
out:
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
update_bitmap_size(mddev, oldsize);
return -1;
}
@@ -1207,24 +1216,27 @@ out:
*/
static int cluster_check_sync_size(struct mddev *mddev)
{
- int i, rv;
- bitmap_super_t *sb;
- unsigned long my_sync_size, sync_size = 0;
- int node_num = mddev->bitmap_info.nodes;
int current_slot = md_cluster_ops->slot_number(mddev);
- struct bitmap *bitmap = mddev->bitmap;
- char str[64];
+ int node_num = mddev->bitmap_info.nodes;
struct dlm_lock_resource *bm_lockres;
+ struct md_bitmap_stats stats;
+ void *bitmap = mddev->bitmap;
+ unsigned long sync_size = 0;
+ unsigned long my_sync_size;
+ char str[64];
+ int i, rv;
- sb = kmap_atomic(bitmap->storage.sb_page);
- my_sync_size = sb->sync_size;
- kunmap_atomic(sb);
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv)
+ return rv;
+
+ my_sync_size = stats.sync_size;
for (i = 0; i < node_num; i++) {
if (i == current_slot)
continue;
- bitmap = get_bitmap_from_slot(mddev, i);
+ bitmap = mddev->bitmap_ops->get_from_slot(mddev, i);
if (IS_ERR(bitmap)) {
pr_err("can't get bitmap from slot %d\n", i);
return -1;
@@ -1238,25 +1250,28 @@ static int cluster_check_sync_size(struct mddev *mddev)
bm_lockres = lockres_init(mddev, str, NULL, 1);
if (!bm_lockres) {
pr_err("md-cluster: Cannot initialize %s\n", str);
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
return -1;
}
bm_lockres->flags |= DLM_LKF_NOQUEUE;
rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
if (!rv)
- md_bitmap_update_sb(bitmap);
+ mddev->bitmap_ops->update_sb(bitmap);
lockres_free(bm_lockres);
- sb = kmap_atomic(bitmap->storage.sb_page);
- if (sync_size == 0)
- sync_size = sb->sync_size;
- else if (sync_size != sb->sync_size) {
- kunmap_atomic(sb);
- md_bitmap_free(bitmap);
+ rv = mddev->bitmap_ops->get_stats(bitmap, &stats);
+ if (rv) {
+ mddev->bitmap_ops->free(bitmap);
+ return rv;
+ }
+
+ if (sync_size == 0) {
+ sync_size = stats.sync_size;
+ } else if (sync_size != stats.sync_size) {
+ mddev->bitmap_ops->free(bitmap);
return -1;
}
- kunmap_atomic(sb);
- md_bitmap_free(bitmap);
+ mddev->bitmap_ops->free(bitmap);
}
return (my_sync_size == sync_size) ? 0 : -1;
@@ -1585,7 +1600,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
if (sn == (cinfo->slot_number - 1))
continue;
- err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
+ err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false);
if (err) {
pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
goto out;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d3a837506a36..179ee4afe937 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -546,137 +546,30 @@ static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_n
return 0;
}
-/*
- * Generic flush handling for md
- */
-
-static void md_end_flush(struct bio *bio)
-{
- struct md_rdev *rdev = bio->bi_private;
- struct mddev *mddev = rdev->mddev;
-
- bio_put(bio);
-
- rdev_dec_pending(rdev, mddev);
-
- if (atomic_dec_and_test(&mddev->flush_pending))
- /* The pre-request flush has finished */
- queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws);
-
-static void submit_flushes(struct work_struct *ws)
+bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
struct md_rdev *rdev;
-
- mddev->start_flush = ktime_get_boottime();
- INIT_WORK(&mddev->flush_work, md_submit_flush_data);
- atomic_set(&mddev->flush_pending, 1);
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Faulty, &rdev->flags)) {
- struct bio *bi;
-
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- bi = bio_alloc_bioset(rdev->bdev, 0,
- REQ_OP_WRITE | REQ_PREFLUSH,
- GFP_NOIO, &mddev->bio_set);
- bi->bi_end_io = md_end_flush;
- bi->bi_private = rdev;
- atomic_inc(&mddev->flush_pending);
- submit_bio(bi);
- rcu_read_lock();
- }
- rcu_read_unlock();
- if (atomic_dec_and_test(&mddev->flush_pending))
- queue_work(md_wq, &mddev->flush_work);
-}
-
-static void md_submit_flush_data(struct work_struct *ws)
-{
- struct mddev *mddev = container_of(ws, struct mddev, flush_work);
- struct bio *bio = mddev->flush_bio;
+ struct bio *new;
/*
- * must reset flush_bio before calling into md_handle_request to avoid a
- * deadlock, because other bios passed md_handle_request suspend check
- * could wait for this and below md_handle_request could wait for those
- * bios because of suspend check
+ * md_flush_reqeust() should be called under md_handle_request() and
+ * 'active_io' is already grabbed. Hence it's safe to get rdev directly
+ * without rcu protection.
*/
- spin_lock_irq(&mddev->lock);
- mddev->prev_flush_start = mddev->start_flush;
- mddev->flush_bio = NULL;
- spin_unlock_irq(&mddev->lock);
- wake_up(&mddev->sb_wait);
+ WARN_ON(percpu_ref_is_zero(&mddev->active_io));
- if (bio->bi_iter.bi_size == 0) {
- /* an empty barrier - all done */
- bio_endio(bio);
- } else {
- bio->bi_opf &= ~REQ_PREFLUSH;
-
- /*
- * make_requst() will never return error here, it only
- * returns error in raid5_make_request() by dm-raid.
- * Since dm always splits data and flush operation into
- * two separate io, io size of flush submitted by dm
- * always is 0, make_request() will not be called here.
- */
- if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio)))
- bio_io_error(bio);
- }
-
- /* The pair is percpu_ref_get() from md_flush_request() */
- percpu_ref_put(&mddev->active_io);
-}
+ rdev_for_each(rdev, mddev) {
+ if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
+ continue;
-/*
- * Manages consolidation of flushes and submitting any flushes needed for
- * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
- * being finished in another context. Returns false if the flushing is
- * complete but still needs the I/O portion of the bio to be processed.
- */
-bool md_flush_request(struct mddev *mddev, struct bio *bio)
-{
- ktime_t req_start = ktime_get_boottime();
- spin_lock_irq(&mddev->lock);
- /* flush requests wait until ongoing flush completes,
- * hence coalescing all the pending requests.
- */
- wait_event_lock_irq(mddev->sb_wait,
- !mddev->flush_bio ||
- ktime_before(req_start, mddev->prev_flush_start),
- mddev->lock);
- /* new request after previous flush is completed */
- if (ktime_after(req_start, mddev->prev_flush_start)) {
- WARN_ON(mddev->flush_bio);
- /*
- * Grab a reference to make sure mddev_suspend() will wait for
- * this flush to be done.
- *
- * md_flush_reqeust() is called under md_handle_request() and
- * 'active_io' is already grabbed, hence percpu_ref_is_zero()
- * won't pass, percpu_ref_tryget_live() can't be used because
- * percpu_ref_kill() can be called by mddev_suspend()
- * concurrently.
- */
- WARN_ON(percpu_ref_is_zero(&mddev->active_io));
- percpu_ref_get(&mddev->active_io);
- mddev->flush_bio = bio;
- spin_unlock_irq(&mddev->lock);
- INIT_WORK(&mddev->flush_work, submit_flushes);
- queue_work(md_wq, &mddev->flush_work);
- return true;
+ new = bio_alloc_bioset(rdev->bdev, 0,
+ REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO,
+ &mddev->bio_set);
+ bio_chain(new, bio);
+ submit_bio(new);
}
- /* flush was performed for some other bio while we waited. */
- spin_unlock_irq(&mddev->lock);
- if (bio->bi_iter.bi_size == 0) {
- /* pure flush without data - all done */
+ if (bio_sectors(bio) == 0) {
bio_endio(bio);
return true;
}
@@ -763,7 +656,6 @@ int mddev_init(struct mddev *mddev)
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
- atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
@@ -772,6 +664,7 @@ int mddev_init(struct mddev *mddev)
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
+ mddev_set_bitmap_ops(mddev);
INIT_WORK(&mddev->sync_work, md_start_sync);
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
@@ -1372,6 +1265,18 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
return ret;
}
+static u64 md_bitmap_events_cleared(struct mddev *mddev)
+{
+ struct md_bitmap_stats stats;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return 0;
+
+ return stats.events_cleared;
+}
+
/*
* validate_super for 0.90.0
* note: we are not using "freshest" for 0.9 superblock
@@ -1464,7 +1369,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
/* if adding to array with a bitmap, then we can accept an
* older device ... but not too old.
*/
- if (ev1 < mddev->bitmap->events_cleared)
+ if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -1991,7 +1896,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
/* If adding to array with a bitmap, then we can accept an
* older device, but not too old.
*/
- if (ev1 < mddev->bitmap->events_cleared)
+ if (ev1 < md_bitmap_events_cleared(mddev))
return 0;
if (ev1 < mddev->events)
set_bit(Bitmap_sync, &rdev->flags);
@@ -2323,7 +2228,6 @@ super_1_allow_new_offset(struct md_rdev *rdev,
unsigned long long new_offset)
{
/* All necessary checks on new >= old have been done */
- struct bitmap *bitmap;
if (new_offset >= rdev->data_offset)
return 1;
@@ -2340,11 +2244,18 @@ super_1_allow_new_offset(struct md_rdev *rdev,
*/
if (rdev->sb_start + (32+4)*2 > new_offset)
return 0;
- bitmap = rdev->mddev->bitmap;
- if (bitmap && !rdev->mddev->bitmap_info.file &&
- rdev->sb_start + rdev->mddev->bitmap_info.offset +
- bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
- return 0;
+
+ if (!rdev->mddev->bitmap_info.file) {
+ struct mddev *mddev = rdev->mddev;
+ struct md_bitmap_stats stats;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (!err && rdev->sb_start + mddev->bitmap_info.offset +
+ stats.file_pages * (PAGE_SIZE >> 9) > new_offset)
+ return 0;
+ }
+
if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
return 0;
@@ -2820,7 +2731,7 @@ repeat:
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
- md_bitmap_update_sb(mddev->bitmap);
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
if (rdev->sb_loaded != 1)
continue; /* no noise on spare devices */
@@ -4142,6 +4053,34 @@ static struct md_sysfs_entry md_level =
__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
static ssize_t
+new_level_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d\n", mddev->new_level);
+}
+
+static ssize_t
+new_level_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int n;
+ int err;
+
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+
+ mddev->new_level = n;
+ md_update_sb(mddev, 1);
+
+ mddev_unlock(mddev);
+ return len;
+}
+static struct md_sysfs_entry md_new_level =
+__ATTR(new_level, 0664, new_level_show, new_level_store);
+
+static ssize_t
layout_show(struct mddev *mddev, char *page)
{
/* just a number, not meaningful for all levels */
@@ -4680,17 +4619,23 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len)
/* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
while (*buf) {
chunk = end_chunk = simple_strtoul(buf, &end, 0);
- if (buf == end) break;
+ if (buf == end)
+ break;
+
if (*end == '-') { /* range */
buf = end + 1;
end_chunk = simple_strtoul(buf, &end, 0);
- if (buf == end) break;
+ if (buf == end)
+ break;
}
- if (*end && !isspace(*end)) break;
- md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
+
+ if (*end && !isspace(*end))
+ break;
+
+ mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk);
buf = skip_spaces(end);
}
- md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
+ mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */
out:
mddev_unlock(mddev);
return len;
@@ -5666,6 +5611,7 @@ __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
static struct attribute *md_default_attrs[] = {
&md_level.attr,
+ &md_new_level.attr,
&md_layout.attr,
&md_raid_disks.attr,
&md_uuid.attr,
@@ -6206,16 +6152,10 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- struct bitmap *bitmap;
-
- bitmap = md_bitmap_create(mddev, -1);
- if (IS_ERR(bitmap)) {
- err = PTR_ERR(bitmap);
+ err = mddev->bitmap_ops->create(mddev, -1);
+ if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
- } else
- mddev->bitmap = bitmap;
-
}
if (err)
goto bitmap_abort;
@@ -6285,7 +6225,7 @@ bitmap_abort:
pers->free(mddev, mddev->private);
mddev->private = NULL;
module_put(pers->owner);
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
@@ -6304,9 +6244,10 @@ int do_md_run(struct mddev *mddev)
err = md_run(mddev);
if (err)
goto out;
- err = md_bitmap_load(mddev);
+
+ err = mddev->bitmap_ops->load(mddev);
if (err) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
goto out;
}
@@ -6450,7 +6391,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
}
- md_bitmap_flush(mddev);
+
+ mddev->bitmap_ops->flush(mddev);
if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
@@ -6477,7 +6419,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
static void mddev_detach(struct mddev *mddev)
{
- md_bitmap_wait_behind_writes(mddev);
+ mddev->bitmap_ops->wait_behind_writes(mddev);
if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) {
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
@@ -6492,7 +6434,8 @@ static void mddev_detach(struct mddev *mddev)
static void __md_stop(struct mddev *mddev)
{
struct md_personality *pers = mddev->pers;
- md_bitmap_destroy(mddev);
+
+ mddev->bitmap_ops->destroy(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
mddev->pers = NULL;
@@ -7270,22 +7213,19 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- struct bitmap *bitmap;
+ err = mddev->bitmap_ops->create(mddev, -1);
+ if (!err)
+ err = mddev->bitmap_ops->load(mddev);
- bitmap = md_bitmap_create(mddev, -1);
- if (!IS_ERR(bitmap)) {
- mddev->bitmap = bitmap;
- err = md_bitmap_load(mddev);
- } else
- err = PTR_ERR(bitmap);
if (err) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
fd = -1;
}
} else if (fd < 0) {
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
}
}
+
if (fd < 0) {
struct file *f = mddev->bitmap_info.file;
if (f) {
@@ -7554,7 +7494,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
goto err;
}
if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
- struct bitmap *bitmap;
/* add the bitmap */
if (mddev->bitmap) {
rv = -EEXIST;
@@ -7568,24 +7507,24 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- bitmap = md_bitmap_create(mddev, -1);
- if (!IS_ERR(bitmap)) {
- mddev->bitmap = bitmap;
- rv = md_bitmap_load(mddev);
- } else
- rv = PTR_ERR(bitmap);
+ rv = mddev->bitmap_ops->create(mddev, -1);
+ if (!rv)
+ rv = mddev->bitmap_ops->load(mddev);
+
if (rv)
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
} else {
- /* remove the bitmap */
- if (!mddev->bitmap) {
- rv = -ENOENT;
+ struct md_bitmap_stats stats;
+
+ rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (rv)
goto err;
- }
- if (mddev->bitmap->storage.file) {
+
+ if (stats.file) {
rv = -EINVAL;
goto err;
}
+
if (mddev->bitmap_info.nodes) {
/* hold PW on all the bitmap lock */
if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
@@ -7600,7 +7539,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- md_bitmap_destroy(mddev);
+ mddev->bitmap_ops->destroy(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -8370,6 +8309,33 @@ static void md_seq_stop(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
}
+static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev)
+{
+ struct md_bitmap_stats stats;
+ unsigned long used_pages;
+ unsigned long chunk_kb;
+ int err;
+
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (err)
+ return;
+
+ chunk_kb = mddev->bitmap_info.chunksize >> 10;
+ used_pages = stats.pages - stats.missing_pages;
+
+ seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk",
+ used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10),
+ chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
+ chunk_kb ? "KB" : "B");
+
+ if (stats.file) {
+ seq_puts(seq, ", file: ");
+ seq_file_path(seq, stats.file, " \t\n");
+ }
+
+ seq_putc(seq, '\n');
+}
+
static int md_seq_show(struct seq_file *seq, void *v)
{
struct mddev *mddev;
@@ -8390,14 +8356,19 @@ static int md_seq_show(struct seq_file *seq, void *v)
spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
- seq_printf(seq, "%s : %sactive", mdname(mddev),
- mddev->pers ? "" : "in");
+ seq_printf(seq, "%s : ", mdname(mddev));
if (mddev->pers) {
+ if (test_bit(MD_BROKEN, &mddev->flags))
+ seq_printf(seq, "broken");
+ else
+ seq_printf(seq, "active");
if (mddev->ro == MD_RDONLY)
seq_printf(seq, " (read-only)");
if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
+ } else {
+ seq_printf(seq, "inactive");
}
sectors = 0;
@@ -8453,7 +8424,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
} else
seq_printf(seq, "\n ");
- md_bitmap_status(seq, mddev->bitmap);
+ md_bitmap_status(seq, mddev);
seq_printf(seq, "\n");
}
@@ -8668,7 +8639,6 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
BUG_ON(mddev->ro == MD_RDONLY);
if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
- flush_work(&mddev->sync_work);
mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
@@ -9506,7 +9476,7 @@ static void md_start_sync(struct work_struct *ws)
* stored on all devices. So make sure all bitmap pages get written.
*/
if (spares)
- md_bitmap_write_all(mddev->bitmap);
+ mddev->bitmap_ops->write_all(mddev);
name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
"reshape" : "resync";
@@ -9594,7 +9564,7 @@ static void unregister_sync_thread(struct mddev *mddev)
void md_check_recovery(struct mddev *mddev)
{
if (mddev->bitmap)
- md_bitmap_daemon_work(mddev);
+ mddev->bitmap_ops->daemon_work(mddev);
if (signal_pending(current)) {
if (mddev->pers->sync_request && !mddev->external) {
@@ -9965,7 +9935,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
if (ret)
pr_info("md-cluster: resize failed\n");
else
- md_bitmap_update_sb(mddev->bitmap);
+ mddev->bitmap_ops->update_sb(mddev->bitmap);
}
/* Check for change of roles in the active devices */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a0d6827dced9..5d2e6bd58e4d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -535,7 +535,8 @@ struct mddev {
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
- struct bitmap *bitmap; /* the bitmap for the device */
+ void *bitmap; /* the bitmap for the device */
+ struct bitmap_operations *bitmap_ops;
struct {
struct file *file; /* the bitmap file */
loff_t offset; /* offset from superblock of
@@ -571,16 +572,6 @@ struct mddev {
*/
struct bio_set io_clone_set;
- /* Generic flush handling.
- * The last to finish preflush schedules a worker to submit
- * the rest of the request (without the REQ_PREFLUSH flag).
- */
- struct bio *flush_bio;
- atomic_t flush_pending;
- ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed
- * flush was started.
- */
- struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */
mempool_t *serial_info_pool;
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 2ea1710a3b70..4378d3250bd7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -140,7 +140,7 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* If bitmap is not enabled, it's safe to submit the io directly, and
* this can get optimal performance.
*/
- if (!md_bitmap_enabled(mddev->bitmap)) {
+ if (!mddev->bitmap_ops->enabled(mddev)) {
raid1_submit_write(bio);
return true;
}
@@ -166,12 +166,9 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio,
* while current io submission must wait for bitmap io to be done. In order to
* avoid such deadlock, submit bitmap io asynchronously.
*/
-static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
+static inline void raid1_prepare_flush_writes(struct mddev *mddev)
{
- if (current->bio_list)
- md_bitmap_unplug_async(bitmap);
- else
- md_bitmap_unplug(bitmap);
+ mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL);
}
/*
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 761989d67906..6c9d24203f39 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -411,18 +411,20 @@ static void raid1_end_read_request(struct bio *bio)
static void close_write(struct r1bio *r1_bio)
{
+ struct mddev *mddev = r1_bio->mddev;
+
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
bio_free_pages(r1_bio->behind_master_bio);
bio_put(r1_bio->behind_master_bio);
r1_bio->behind_master_bio = NULL;
}
+
/* clear the bitmap if all writes complete successfully */
- md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
- r1_bio->sectors,
- !test_bit(R1BIO_Degraded, &r1_bio->state),
- test_bit(R1BIO_BehindIO, &r1_bio->state));
- md_write_end(r1_bio->mddev);
+ mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors,
+ !test_bit(R1BIO_Degraded, &r1_bio->state),
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
+ md_write_end(mddev);
}
static void r1_bio_write_done(struct r1bio *r1_bio)
@@ -900,7 +902,7 @@ static void wake_up_barrier(struct r1conf *conf)
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
- raid1_prepare_flush_writes(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1317,13 +1319,11 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct bio *read_bio;
- struct bitmap *bitmap = mddev->bitmap;
const enum req_op op = bio_op(bio);
const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC;
int max_sectors;
int rdisk;
bool r1bio_existed = !!r1_bio;
- char b[BDEVNAME_SIZE];
/*
* If r1_bio is set, we are blocking the raid1d thread
@@ -1332,16 +1332,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
*/
gfp_t gfp = r1_bio ? (GFP_NOIO | __GFP_HIGH) : GFP_NOIO;
- if (r1bio_existed) {
- /* Need to get the block device name carefully */
- struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
-
- if (rdev)
- snprintf(b, sizeof(b), "%pg", rdev->bdev);
- else
- strcpy(b, "???");
- }
-
/*
* Still need barrier for READ in case that whole
* array is frozen.
@@ -1363,15 +1353,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* used and no empty request is available.
*/
rdisk = read_balance(conf, r1_bio, &max_sectors);
-
if (rdisk < 0) {
/* couldn't find anywhere to read from */
- if (r1bio_existed) {
- pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+ if (r1bio_existed)
+ pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n",
mdname(mddev),
- b,
- (unsigned long long)r1_bio->sector);
- }
+ conf->mirrors[r1_bio->read_disk].rdev->bdev,
+ r1_bio->sector);
raid_end_bio_io(r1_bio);
return;
}
@@ -1383,15 +1371,13 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
(unsigned long long)r1_bio->sector,
mirror->rdev->bdev);
- if (test_bit(WriteMostly, &mirror->rdev->flags) &&
- bitmap) {
+ if (test_bit(WriteMostly, &mirror->rdev->flags)) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
- wait_event(bitmap->behind_wait,
- atomic_read(&bitmap->behind_writes) == 0);
+ mddev->bitmap_ops->wait_behind_writes(mddev);
}
if (max_sectors < bio_sectors(bio)) {
@@ -1432,7 +1418,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;
int i, disks;
- struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
struct md_rdev *blocked_rdev;
int first_clone;
@@ -1585,7 +1570,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* at a time and thus needs a new bio that can fit the whole payload
* this bio in page sized chunks.
*/
- if (write_behind && bitmap)
+ if (write_behind && mddev->bitmap)
max_sectors = min_t(int, max_sectors,
BIO_MAX_VECS * (PAGE_SIZE >> 9));
if (max_sectors < bio_sectors(bio)) {
@@ -1612,19 +1597,23 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
continue;
if (first_clone) {
+ unsigned long max_write_behind =
+ mddev->bitmap_info.max_write_behind;
+ struct md_bitmap_stats stats;
+ int err;
+
/* do behind I/O ?
* Not if there are too many, or cannot
* allocate memory, or a reader on WriteMostly
* is waiting for behind writes to flush */
- if (bitmap && write_behind &&
- (atomic_read(&bitmap->behind_writes)
- < mddev->bitmap_info.max_write_behind) &&
- !waitqueue_active(&bitmap->behind_wait)) {
+ err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats);
+ if (!err && write_behind && !stats.behind_wait &&
+ stats.behind_writes < max_write_behind)
alloc_behind_master_bio(r1_bio, bio);
- }
- md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
- test_bit(R1BIO_BehindIO, &r1_bio->state));
+ mddev->bitmap_ops->startwrite(
+ mddev, r1_bio->sector, r1_bio->sectors,
+ test_bit(R1BIO_BehindIO, &r1_bio->state));
first_clone = 0;
}
@@ -2042,7 +2031,7 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio)
/* make sure these bits don't get cleared. */
do {
- md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks);
s += sync_blocks;
sectors_to_go -= sync_blocks;
} while (sectors_to_go > 0);
@@ -2771,7 +2760,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
int wonly = -1;
int write_targets = 0, read_targets = 0;
sector_t sync_blocks;
- int still_degraded = 0;
+ bool still_degraded = false;
int good_sectors = RESYNC_SECTORS;
int min_bad = 0; /* number of sectors that are bad in all devices */
int idx = sector_to_idx(sector_nr);
@@ -2788,12 +2777,12 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* We can find the current addess in mddev->curr_resync
*/
if (mddev->curr_resync < max_sector) /* aborted */
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
@@ -2813,7 +2802,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
/* before building a request, check if we can skip these blocks..
* This call the bitmap_start_sync doesn't actually record anything
*/
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
/* We can skip this block, and probably several more */
*skipped = 1;
@@ -2831,9 +2820,9 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* sector_nr + two times RESYNC_SECTORS
*/
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
- mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
-
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
+ mddev_is_clustered(mddev) &&
+ (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
if (raise_barrier(conf, sector_nr))
return 0;
@@ -2864,7 +2853,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags)) {
if (i < conf->raid_disks)
- still_degraded = 1;
+ still_degraded = true;
} else if (!test_bit(In_sync, &rdev->flags)) {
bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
@@ -2988,8 +2977,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (len == 0)
break;
if (sync_blocks == 0) {
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
- &sync_blocks, still_degraded) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
+ &sync_blocks, still_degraded) &&
!conf->fullsync &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
break;
@@ -3313,14 +3302,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* worth it.
*/
sector_t newsize = raid1_size(mddev, sectors, 0);
+ int ret;
+
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2a9c4ee982e0..f3bf1116794a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -426,12 +426,13 @@ static void raid10_end_read_request(struct bio *bio)
static void close_write(struct r10bio *r10_bio)
{
+ struct mddev *mddev = r10_bio->mddev;
+
/* clear the bitmap if all writes complete successfully */
- md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
- r10_bio->sectors,
- !test_bit(R10BIO_Degraded, &r10_bio->state),
- 0);
- md_write_end(r10_bio->mddev);
+ mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors,
+ !test_bit(R10BIO_Degraded, &r10_bio->state),
+ false);
+ md_write_end(mddev);
}
static void one_write_done(struct r10bio *r10_bio)
@@ -884,7 +885,7 @@ static void flush_pending_writes(struct r10conf *conf)
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug);
- raid1_prepare_flush_writes(conf->mddev->bitmap);
+ raid1_prepare_flush_writes(conf->mddev);
wake_up(&conf->wait_barrier);
while (bio) { /* submit pending writes */
@@ -1100,7 +1101,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
/* we aren't scheduling, so we can do the write-out directly. */
bio = bio_list_get(&plug->pending);
- raid1_prepare_flush_writes(mddev->bitmap);
+ raid1_prepare_flush_writes(mddev);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
@@ -1492,7 +1493,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
- md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
+ mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors,
+ false);
for (i = 0; i < conf->copies; i++) {
if (r10_bio->devs[i].bio)
@@ -2465,7 +2467,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
s = PAGE_SIZE >> 9;
rdev = conf->mirrors[dr].rdev;
- addr = r10_bio->devs[0].addr + sect,
+ addr = r10_bio->devs[0].addr + sect;
ok = sync_page_io(rdev,
addr,
s << 9,
@@ -3192,13 +3194,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (mddev->curr_resync < max_sector) { /* aborted */
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev,
+ mddev->curr_resync,
+ &sync_blocks);
else for (i = 0; i < conf->geo.raid_disks; i++) {
sector_t sect =
raid10_find_virt(conf, mddev->curr_resync, i);
- md_bitmap_end_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
+
+ mddev->bitmap_ops->end_sync(mddev, sect,
+ &sync_blocks);
}
} else {
/* completed sync */
@@ -3218,7 +3222,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
}
conf->fullsync = 0;
}
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
close_sync(conf);
*skipped = 1;
return sectors_skipped;
@@ -3287,10 +3291,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio = NULL;
for (i = 0 ; i < conf->geo.raid_disks; i++) {
- int still_degraded;
+ bool still_degraded;
struct r10bio *rb2;
sector_t sect;
- int must_sync;
+ bool must_sync;
int any_working;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3307,7 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
if (!mrdev && !mreplace)
continue;
- still_degraded = 0;
+ still_degraded = false;
/* want to reconstruct this device */
rb2 = r10_bio;
sect = raid10_find_virt(conf, sector_nr, i);
@@ -3320,8 +3324,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* we only need to recover the block if it is set in
* the bitmap
*/
- must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, 1);
+ must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
+ &sync_blocks,
+ true);
if (sync_blocks < max_sync)
max_sync = sync_blocks;
if (!must_sync &&
@@ -3359,13 +3364,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
struct md_rdev *rdev = conf->mirrors[j].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
- still_degraded = 1;
+ still_degraded = false;
break;
}
}
- must_sync = md_bitmap_start_sync(mddev->bitmap, sect,
- &sync_blocks, still_degraded);
+ must_sync = mddev->bitmap_ops->start_sync(mddev, sect,
+ &sync_blocks, still_degraded);
any_working = 0;
for (j=0; j<conf->copies;j++) {
@@ -3538,12 +3543,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
* safety reason, which ensures curr_resync_completed is
* updated in bitmap_cond_end_sync.
*/
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
- if (!md_bitmap_start_sync(mddev->bitmap, sector_nr,
- &sync_blocks, mddev->degraded) &&
+ if (!mddev->bitmap_ops->start_sync(mddev, sector_nr,
+ &sync_blocks,
+ mddev->degraded) &&
!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
&mddev->recovery)) {
/* We can skip this block */
@@ -4190,6 +4196,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
*/
struct r10conf *conf = mddev->private;
sector_t oldsize, size;
+ int ret;
if (mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -4202,11 +4209,11 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > size)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, size, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, size);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > oldsize) {
@@ -4472,7 +4479,7 @@ static int raid10_start_reshape(struct mddev *mddev)
newsize = raid10_size(mddev, 0, conf->geo.raid_disks);
if (!mddev_is_clustered(mddev)) {
- ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
else
@@ -4487,20 +4494,20 @@ static int raid10_start_reshape(struct mddev *mddev)
/*
* some node is already performing reshape, and no need to
- * call md_bitmap_resize again since it should be called when
+ * call bitmap_ops->resize again since it should be called when
* receiving BITMAP_RESIZE msg
*/
if ((sb && (le32_to_cpu(sb->feature_map) &
MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize))
goto out;
- ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false);
if (ret)
goto abort;
ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize);
if (ret) {
- md_bitmap_resize(mddev->bitmap, oldsize, 0, 0);
+ mddev->bitmap_ops->resize(mddev, oldsize, 0, false);
goto abort;
}
}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 874874fe4fa1..b4f7b79fd187 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -313,10 +313,10 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
if (sh->dev[i].written) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
r5c_return_dev_pending_writes(conf, &sh->dev[i]);
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
+ conf->mddev->bitmap_ops->endwrite(conf->mddev,
+ sh->sector, RAID5_STRIPE_SECTORS(conf),
+ !test_bit(STRIPE_DEGRADED, &sh->state),
+ false);
}
}
}
@@ -2798,7 +2798,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
{
struct r5l_log *log = READ_ONCE(conf->log);
int i;
- int do_wakeup = 0;
sector_t tree_index;
void __rcu **pslot;
uintptr_t refcount;
@@ -2815,7 +2814,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
for (i = sh->disks; i--; ) {
clear_bit(R5_InJournal, &sh->dev[i].flags);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
}
/*
@@ -2828,9 +2827,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (atomic_dec_and_test(&conf->pending_full_writes))
md_wakeup_thread(conf->mddev->thread);
- if (do_wakeup)
- wake_up(&conf->wait_for_overlap);
-
spin_lock_irq(&log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&log->stripe_in_journal_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c14cf2410365..dc2ea636d173 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2337,7 +2337,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_and_clear_bit(R5_Overlap, &dev->flags))
- wake_up(&sh->raid_conf->wait_for_overlap);
+ wake_up_bit(&dev->flags, R5_Overlap);
}
}
local_unlock(&conf->percpu->lock);
@@ -3473,7 +3473,7 @@ static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi,
* With PPL only writes to consecutive data chunks within a
* stripe are allowed because for a single stripe_head we can
* only have one PPL entry at a time, which describes one data
- * range. Not really an overlap, but wait_for_overlap can be
+ * range. Not really an overlap, but R5_Overlap can be
* used to handle this.
*/
sector_t sector;
@@ -3563,8 +3563,8 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
*/
set_bit(STRIPE_BITMAP_PENDING, &sh->state);
spin_unlock_irq(&sh->stripe_lock);
- md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0);
+ conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf), false);
spin_lock_irq(&sh->stripe_lock);
clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
if (!sh->batch_head) {
@@ -3652,7 +3652,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
log_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
@@ -3663,8 +3663,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
bi = nextbi;
}
if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
+ conf->mddev->bitmap_ops->endwrite(conf->mddev,
+ sh->sector, RAID5_STRIPE_SECTORS(conf),
+ false, false);
bitmap_end = 0;
/* and fail all 'written' */
bi = sh->dev[i].written;
@@ -3696,7 +3697,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
sh->dev[i].toread = NULL;
spin_unlock_irq(&sh->stripe_lock);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
if (bi)
s->to_read--;
while (bi && bi->bi_iter.bi_sector <
@@ -3709,8 +3710,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
}
}
if (bitmap_end)
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf), 0, 0);
+ conf->mddev->bitmap_ops->endwrite(conf->mddev,
+ sh->sector, RAID5_STRIPE_SECTORS(conf),
+ false, false);
/* If we were in the middle of a write the parity block might
* still be locked - so just clear all R5_LOCKED flags
*/
@@ -3734,7 +3736,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(sh->batch_head);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
s->syncing = 0;
s->replacing = 0;
/* There is nothing more to do for sync/check/repair.
@@ -4059,10 +4061,10 @@ returnbi:
bio_endio(wbi);
wbi = wbi2;
}
- md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- !test_bit(STRIPE_DEGRADED, &sh->state),
- 0);
+ conf->mddev->bitmap_ops->endwrite(conf->mddev,
+ sh->sector, RAID5_STRIPE_SECTORS(conf),
+ !test_bit(STRIPE_DEGRADED, &sh->state),
+ false);
if (head_sh->batch_head) {
sh = list_first_entry(&sh->batch_list,
struct stripe_head,
@@ -4875,7 +4877,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
{
struct stripe_head *sh, *next;
int i;
- int do_wakeup = 0;
list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
@@ -4911,7 +4912,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&sh->stripe_lock);
for (i = 0; i < sh->disks; i++) {
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&sh->dev[i].flags, R5_Overlap);
sh->dev[i].flags = head_sh->dev[i].flags &
(~((1 << R5_WriteError) | (1 << R5_Overlap)));
}
@@ -4925,12 +4926,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
spin_unlock_irq(&head_sh->stripe_lock);
for (i = 0; i < head_sh->disks; i++)
if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
- do_wakeup = 1;
+ wake_up_bit(&head_sh->dev[i].flags, R5_Overlap);
if (head_sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &head_sh->state);
-
- if (do_wakeup)
- wake_up(&head_sh->raid_conf->wait_for_overlap);
}
static void handle_stripe(struct stripe_head *sh)
@@ -5196,7 +5194,7 @@ static void handle_stripe(struct stripe_head *sh)
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
clear_bit(STRIPE_SYNCING, &sh->state);
if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
- wake_up(&conf->wait_for_overlap);
+ wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap);
}
/* If the failed drives are just a ReadError, then we might need
@@ -5259,7 +5257,7 @@ static void handle_stripe(struct stripe_head *sh)
} else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1);
}
@@ -5753,12 +5751,11 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
int d;
again:
sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
- prepare_to_wait(&conf->wait_for_overlap, &w,
- TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5770,12 +5767,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
raid5_release_stripe(sh);
- schedule();
+ wait_on_bit(&sh->dev[d].flags, R5_Overlap,
+ TASK_UNINTERRUPTIBLE);
goto again;
}
}
set_bit(STRIPE_DISCARD, &sh->state);
- finish_wait(&conf->wait_for_overlap, &w);
sh->overwrite_disks = 0;
for (d = 0; d < conf->raid_disks; d++) {
if (d == sh->pd_idx || d == sh->qd_idx)
@@ -5788,13 +5785,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
}
spin_unlock_irq(&sh->stripe_lock);
if (conf->mddev->bitmap) {
- for (d = 0;
- d < conf->raid_disks - conf->max_degraded;
+ for (d = 0; d < conf->raid_disks - conf->max_degraded;
d++)
- md_bitmap_startwrite(mddev->bitmap,
- sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- 0);
+ mddev->bitmap_ops->startwrite(mddev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf), false);
sh->bm_seq = conf->seq_flush + 1;
set_bit(STRIPE_BIT_DELAY, &sh->state);
}
@@ -5855,7 +5849,6 @@ static int add_all_stripe_bios(struct r5conf *conf,
struct bio *bi, int forwrite, int previous)
{
int dd_idx;
- int ret = 1;
spin_lock_irq(&sh->stripe_lock);
@@ -5871,14 +5864,19 @@ static int add_all_stripe_bios(struct r5conf *conf,
if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) {
set_bit(R5_Overlap, &dev->flags);
- ret = 0;
- continue;
+ spin_unlock_irq(&sh->stripe_lock);
+ raid5_release_stripe(sh);
+ /* release batch_last before wait to avoid risk of deadlock */
+ if (ctx->batch_last) {
+ raid5_release_stripe(ctx->batch_last);
+ ctx->batch_last = NULL;
+ }
+ md_wakeup_thread(conf->mddev->thread);
+ wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE);
+ return 0;
}
}
- if (!ret)
- goto out;
-
for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) {
struct r5dev *dev = &sh->dev[dd_idx];
@@ -5894,9 +5892,8 @@ static int add_all_stripe_bios(struct r5conf *conf,
RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do);
}
-out:
spin_unlock_irq(&sh->stripe_lock);
- return ret;
+ return 1;
}
enum reshape_loc {
@@ -5992,17 +5989,17 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
goto out_release;
}
- if (test_bit(STRIPE_EXPANDING, &sh->state) ||
- !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
- /*
- * Stripe is busy expanding or add failed due to
- * overlap. Flush everything and wait a while.
- */
+ if (test_bit(STRIPE_EXPANDING, &sh->state)) {
md_wakeup_thread(mddev->thread);
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out_release;
}
+ if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) {
+ ret = STRIPE_RETRY;
+ goto out;
+ }
+
if (stripe_can_batch(sh)) {
stripe_add_to_batch_list(conf, sh, ctx->batch_last);
if (ctx->batch_last)
@@ -6073,6 +6070,7 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ bool on_wq;
struct r5conf *conf = mddev->private;
sector_t logical_sector;
struct stripe_request_ctx ctx = {};
@@ -6146,11 +6144,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
* sequential IO pattern. We don't bother with the optimization when
* reshaping as the performance benefit is not worth the complexity.
*/
- if (likely(conf->reshape_progress == MaxSector))
+ if (likely(conf->reshape_progress == MaxSector)) {
logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
+ on_wq = false;
+ } else {
+ add_wait_queue(&conf->wait_for_reshape, &wait);
+ on_wq = true;
+ }
s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
- add_wait_queue(&conf->wait_for_overlap, &wait);
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
@@ -6161,6 +6163,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
continue;
if (res == STRIPE_SCHEDULE_AND_RETRY) {
+ WARN_ON_ONCE(!on_wq);
/*
* Must release the reference to batch_last before
* scheduling and waiting for work to be done,
@@ -6185,7 +6188,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
logical_sector = ctx.first_sector +
(s << RAID5_STRIPE_SHIFT(conf));
}
- remove_wait_queue(&conf->wait_for_overlap, &wait);
+ if (unlikely(on_wq))
+ remove_wait_queue(&conf->wait_for_reshape, &wait);
if (ctx.batch_last)
raid5_release_stripe(ctx.batch_last);
@@ -6338,7 +6342,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes)==0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6364,7 +6368,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
@@ -6447,7 +6451,7 @@ finish:
(sector_nr - mddev->curr_resync_completed) * 2
>= mddev->resync_max - mddev->curr_resync_completed) {
/* Cannot proceed until we've updated the superblock... */
- wait_event(conf->wait_for_overlap,
+ wait_event(conf->wait_for_reshape,
atomic_read(&conf->reshape_stripes) == 0
|| test_bit(MD_RECOVERY_INTR, &mddev->recovery));
if (atomic_read(&conf->reshape_stripes) != 0)
@@ -6473,7 +6477,7 @@ finish:
spin_lock_irq(&conf->device_lock);
conf->reshape_safe = mddev->reshape_position;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
}
ret:
@@ -6486,7 +6490,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
sector_t sync_blocks;
- int still_degraded = 0;
+ bool still_degraded = false;
int i;
if (sector_nr >= max_sector) {
@@ -6498,17 +6502,17 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (mddev->curr_resync < max_sector) /* aborted */
- md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
- &sync_blocks, 1);
+ mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync,
+ &sync_blocks);
else /* completed sync */
conf->fullsync = 0;
- md_bitmap_close_sync(mddev->bitmap);
+ mddev->bitmap_ops->close_sync(mddev);
return 0;
}
/* Allow raid5_quiesce to complete */
- wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+ wait_event(conf->wait_for_reshape, conf->quiesce != 2);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -6531,7 +6535,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
}
if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
!conf->fullsync &&
- !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
+ !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ true) &&
sync_blocks >= RAID5_STRIPE_SECTORS(conf)) {
/* we can skip this block, and probably more */
do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf));
@@ -6540,7 +6545,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
return sync_blocks * RAID5_STRIPE_SECTORS(conf);
}
- md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
+ mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false);
sh = raid5_get_active_stripe(conf, NULL, sector_nr,
R5_GAS_NOBLOCK);
@@ -6559,10 +6564,11 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
struct md_rdev *rdev = conf->disks[i].rdev;
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
- still_degraded = 1;
+ still_degraded = true;
}
- md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
+ mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks,
+ still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
@@ -6767,7 +6773,7 @@ static void raid5d(struct md_thread *thread)
/* Now is a good time to flush some bitmap updates */
conf->seq_flush++;
spin_unlock_irq(&conf->device_lock);
- md_bitmap_unplug(mddev->bitmap);
+ mddev->bitmap_ops->unplug(mddev, true);
spin_lock_irq(&conf->device_lock);
conf->seq_write = conf->seq_flush;
activate_bit_delay(conf, conf->temp_inactive_list);
@@ -7492,7 +7498,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
init_waitqueue_head(&conf->wait_for_quiescent);
init_waitqueue_head(&conf->wait_for_stripe);
- init_waitqueue_head(&conf->wait_for_overlap);
+ init_waitqueue_head(&conf->wait_for_reshape);
INIT_LIST_HEAD(&conf->handle_list);
INIT_LIST_HEAD(&conf->loprio_list);
INIT_LIST_HEAD(&conf->hold_list);
@@ -8312,6 +8318,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
*/
sector_t newsize;
struct r5conf *conf = mddev->private;
+ int ret;
if (raid5_has_log(conf) || raid5_has_ppl(conf))
return -EINVAL;
@@ -8320,11 +8327,11 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
if (mddev->external_size &&
mddev->array_sectors > newsize)
return -EINVAL;
- if (mddev->bitmap) {
- int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0);
- if (ret)
- return ret;
- }
+
+ ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false);
+ if (ret)
+ return ret;
+
md_set_array_sectors(mddev, newsize);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
@@ -8550,7 +8557,7 @@ static void end_reshape(struct r5conf *conf)
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
@@ -8614,13 +8621,13 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
conf->quiesce = 1;
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
unlock_all_device_hash_locks_irq(conf);
}
log_quiesce(conf, quiesce);
@@ -8939,7 +8946,7 @@ static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- wake_up(&conf->wait_for_overlap);
+ wake_up(&conf->wait_for_reshape);
}
static struct md_personality raid6_personality =
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9b5a7dc3f2a0..896ecfc4afa6 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -668,7 +668,7 @@ struct r5conf {
struct llist_head released_stripes;
wait_queue_head_t wait_for_quiescent;
wait_queue_head_t wait_for_stripe;
- wait_queue_head_t wait_for_overlap;
+ wait_queue_head_t wait_for_reshape;
unsigned long cache_state;
struct shrinker *shrinker;
int pool_size; /* number of disks in stripeheads in pool */
diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c
index 6f7e7a8fa5ae..ed5167f942d8 100644
--- a/drivers/nvme/common/keyring.c
+++ b/drivers/nvme/common/keyring.c
@@ -20,6 +20,28 @@ key_serial_t nvme_keyring_id(void)
}
EXPORT_SYMBOL_GPL(nvme_keyring_id);
+static bool nvme_tls_psk_revoked(struct key *psk)
+{
+ return test_bit(KEY_FLAG_REVOKED, &psk->flags) ||
+ test_bit(KEY_FLAG_INVALIDATED, &psk->flags);
+}
+
+struct key *nvme_tls_key_lookup(key_serial_t key_id)
+{
+ struct key *key = key_lookup(key_id);
+
+ if (IS_ERR(key)) {
+ pr_err("key id %08x not found\n", key_id);
+ return key;
+ }
+ if (nvme_tls_psk_revoked(key)) {
+ pr_err("key id %08x revoked\n", key_id);
+ return ERR_PTR(-EKEYREVOKED);
+ }
+ return key;
+}
+EXPORT_SYMBOL_GPL(nvme_tls_key_lookup);
+
static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m)
{
seq_puts(m, key->description);
@@ -36,14 +58,12 @@ static bool nvme_tls_psk_match(const struct key *key,
pr_debug("%s: no key description\n", __func__);
return false;
}
- match_len = strlen(key->description);
- pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len);
-
if (!match_data->raw_data) {
pr_debug("%s: no match data\n", __func__);
return false;
}
match_id = match_data->raw_data;
+ match_len = strlen(match_id);
pr_debug("%s: match '%s' '%s' len %zd\n",
__func__, match_id, key->description, match_len);
return !memcmp(key->description, match_id, match_len);
@@ -71,7 +91,7 @@ static struct key_type nvme_tls_psk_key_type = {
static struct key *nvme_tls_psk_lookup(struct key *keyring,
const char *hostnqn, const char *subnqn,
- int hmac, bool generated)
+ u8 hmac, u8 psk_ver, bool generated)
{
char *identity;
size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11;
@@ -82,8 +102,8 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
if (!identity)
return ERR_PTR(-ENOMEM);
- snprintf(identity, identity_len, "NVMe0%c%02d %s %s",
- generated ? 'G' : 'R', hmac, hostnqn, subnqn);
+ snprintf(identity, identity_len, "NVMe%u%c%02u %s %s",
+ psk_ver, generated ? 'G' : 'R', hmac, hostnqn, subnqn);
if (!keyring)
keyring = nvme_keyring;
@@ -107,21 +127,38 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring,
/*
* NVMe PSK priority list
*
- * 'Retained' PSKs (ie 'generated == false')
- * should be preferred to 'generated' PSKs,
- * and SHA-384 should be preferred to SHA-256.
+ * 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated'
+ * PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash
+ * (psk_ver 0), and SHA-384 should be preferred to SHA-256.
*/
static struct nvme_tls_psk_priority_list {
bool generated;
+ u8 psk_ver;
enum nvme_tcp_tls_cipher cipher;
} nvme_tls_psk_prio[] = {
{ .generated = false,
+ .psk_ver = 1,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+ { .generated = false,
+ .psk_ver = 1,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+ { .generated = false,
+ .psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = false,
+ .psk_ver = 0,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA256, },
+ { .generated = true,
+ .psk_ver = 1,
+ .cipher = NVME_TCP_TLS_CIPHER_SHA384, },
+ { .generated = true,
+ .psk_ver = 1,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
{ .generated = true,
+ .psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA384, },
{ .generated = true,
+ .psk_ver = 0,
.cipher = NVME_TCP_TLS_CIPHER_SHA256, },
};
@@ -137,10 +174,11 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) {
bool generated = nvme_tls_psk_prio[prio].generated;
+ u8 ver = nvme_tls_psk_prio[prio].psk_ver;
enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher;
tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn,
- cipher, generated);
+ cipher, ver, generated);
if (!IS_ERR(tls_key)) {
tls_key_id = tls_key->serial;
key_put(tls_key);
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index a3caef75aa0a..486afe598184 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -41,6 +41,7 @@ config NVME_HWMON
config NVME_FABRICS
select NVME_CORE
+ select NVME_KEYRING if NVME_TCP_TLS
tristate
config NVME_RDMA
@@ -94,7 +95,6 @@ config NVME_TCP
config NVME_TCP_TLS
bool "NVMe over Fabrics TCP TLS encryption support"
depends on NVME_TCP
- select NVME_KEYRING
select NET_HANDSHAKE
select KEYS
help
@@ -109,6 +109,7 @@ config NVME_HOST_AUTH
bool "NVMe over Fabrics In-Band Authentication in host side"
depends on NVME_CORE
select NVME_AUTH
+ select NVME_KEYRING if NVME_TCP_TLS
help
This provides support for NVMe over Fabrics In-Band Authentication in
host side.
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 983909a600ad..ca9959a8fb9e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
+#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-integrity.h>
@@ -987,8 +988,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
cmnd->rw.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
cmnd->rw.reftag = 0;
- cmnd->rw.apptag = 0;
- cmnd->rw.appmask = 0;
+ cmnd->rw.lbat = 0;
+ cmnd->rw.lbatm = 0;
if (ns->head->ms) {
/*
@@ -4040,6 +4041,35 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
}
}
+/**
+ * struct async_scan_info - keeps track of controller & NSIDs to scan
+ * @ctrl: Controller on which namespaces are being scanned
+ * @next_nsid: Index of next NSID to scan in ns_list
+ * @ns_list: Pointer to list of NSIDs to scan
+ *
+ * Note: There is a single async_scan_info structure shared by all instances
+ * of nvme_scan_ns_async() scanning a given controller, so the atomic
+ * operations on next_nsid are critical to ensure each instance scans a unique
+ * NSID.
+ */
+struct async_scan_info {
+ struct nvme_ctrl *ctrl;
+ atomic_t next_nsid;
+ __le32 *ns_list;
+};
+
+static void nvme_scan_ns_async(void *data, async_cookie_t cookie)
+{
+ struct async_scan_info *scan_info = data;
+ int idx;
+ u32 nsid;
+
+ idx = (u32)atomic_fetch_inc(&scan_info->next_nsid);
+ nsid = le32_to_cpu(scan_info->ns_list[idx]);
+
+ nvme_scan_ns(scan_info->ctrl, nsid);
+}
+
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid)
{
@@ -4066,11 +4096,15 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
__le32 *ns_list;
u32 prev = 0;
int ret = 0, i;
+ ASYNC_DOMAIN(domain);
+ struct async_scan_info scan_info;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
+ scan_info.ctrl = ctrl;
+ scan_info.ns_list = ns_list;
for (;;) {
struct nvme_command cmd = {
.identify.opcode = nvme_admin_identify,
@@ -4086,19 +4120,23 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
goto free;
}
+ atomic_set(&scan_info.next_nsid, 0);
for (i = 0; i < nr_entries; i++) {
u32 nsid = le32_to_cpu(ns_list[i]);
if (!nsid) /* end of the list? */
goto out;
- nvme_scan_ns(ctrl, nsid);
+ async_schedule_domain(nvme_scan_ns_async, &scan_info,
+ &domain);
while (++prev < nsid)
nvme_ns_remove_by_nsid(ctrl, prev);
}
+ async_synchronize_full_domain(&domain);
}
out:
nvme_remove_invalid_namespaces(ctrl, prev);
free:
+ async_synchronize_full_domain(&domain);
kfree(ns_list);
return ret;
}
@@ -4568,7 +4606,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
set->flags = BLK_MQ_F_SHOULD_MERGE;
if (ctrl->ops->flags & NVME_F_BLOCKING)
set->flags |= BLK_MQ_F_BLOCKING;
- set->cmd_size = cmd_size,
+ set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
@@ -4678,7 +4716,6 @@ static void nvme_free_ctrl(struct device *dev)
if (!subsys || ctrl->instance != subsys->instance)
ida_free(&nvme_instance_ida, ctrl->instance);
- key_put(ctrl->tls_key);
nvme_free_cels(ctrl);
nvme_mpath_uninit(ctrl);
cleanup_srcu_struct(&ctrl->srcu);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index f5f545fa0103..432efcbf9e2f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -665,7 +665,7 @@ static struct key *nvmf_parse_key(int key_id)
return ERR_PTR(-EINVAL);
}
- key = key_lookup(key_id);
+ key = nvme_tls_key_lookup(key_id);
if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id);
else
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index f1d58e70933f..1d769c842fbf 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -4,6 +4,7 @@
* Copyright (c) 2017-2021 Christoph Hellwig.
*/
#include <linux/bio-integrity.h>
+#include <linux/blk-integrity.h>
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring/cmd.h>
@@ -119,9 +120,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
struct request_queue *q = req->q;
struct nvme_ns *ns = q->queuedata;
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
+ bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk);
+ bool has_metadata = meta_buffer && meta_len;
struct bio *bio = NULL;
int ret;
+ if (has_metadata && !supports_metadata)
+ return -EINVAL;
+
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
struct iov_iter iter;
@@ -143,15 +149,15 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
goto out;
bio = req->bio;
- if (bdev) {
+ if (bdev)
bio_set_dev(bio, bdev);
- if (meta_buffer && meta_len) {
- ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
- meta_seed);
- if (ret)
- goto out_unmap;
- req->cmd_flags |= REQ_INTEGRITY;
- }
+
+ if (has_metadata) {
+ ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
+ meta_seed);
+ if (ret)
+ goto out_unmap;
+ req->cmd_flags |= REQ_INTEGRITY;
}
return ret;
@@ -260,8 +266,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.control = cpu_to_le16(io.control);
c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
c.rw.reftag = cpu_to_le32(io.reftag);
- c.rw.apptag = cpu_to_le16(io.apptag);
- c.rw.appmask = cpu_to_le16(io.appmask);
+ c.rw.lbat = cpu_to_le16(io.apptag);
+ c.rw.lbatm = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata,
meta_len, lower_32_bits(io.slba), NULL, 0, 0);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index da57947130cc..313a4f978a2c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -91,6 +91,11 @@ enum nvme_quirks {
NVME_QUIRK_NO_DEEPEST_PS = (1 << 5),
/*
+ * Problems seen with concurrent commands
+ */
+ NVME_QUIRK_QDEPTH_ONE = (1 << 6),
+
+ /*
* Set MEDIUM priority on SQ creation
*/
NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7),
@@ -372,7 +377,7 @@ struct nvme_ctrl {
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
#endif
- struct key *tls_key;
+ key_serial_t tls_pskid;
/* Power saving configuration */
u64 ps_max_latency_us;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c0533f3f64cb..7990c3f22ecf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2563,15 +2563,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
else
dev->io_sqes = NVME_NVM_IOSQES;
- /*
- * Temporary fix for the Apple controller found in the MacBook8,1 and
- * some MacBook7,1 to avoid controller resets and data loss.
- */
- if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
+ if (dev->ctrl.quirks & NVME_QUIRK_QDEPTH_ONE) {
dev->q_depth = 2;
- dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
- "set queue depth=%u to work around controller resets\n",
- dev->q_depth);
} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
(pdev->device == 0xa821 || pdev->device == 0xa822) &&
NVME_CAP_MQES(dev->ctrl.cap) == 0) {
@@ -3442,6 +3435,8 @@ static const struct pci_device_id nvme_id_table[] = {
NVME_QUIRK_BOGUS_NID, },
{ PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_BOGUS_NID, },
+ { PCI_DEVICE(0x1217, 0x8760), /* O2 Micro 64GB Steam Deck */
+ .driver_data = NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_BOGUS_NID, },
@@ -3576,7 +3571,12 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
- .driver_data = NVME_QUIRK_SINGLE_VECTOR },
+ /*
+ * Fix for the Apple controller found in the MacBook8,1 and
+ * some MacBook7,1 to avoid controller resets and data loss.
+ */
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_QDEPTH_ONE },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2eb33842f971..15b5e06039a5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1363,8 +1363,8 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
if (control & NVME_RW_PRINFO_PRCHK_REF)
domain->sig.dif.ref_remap = true;
- domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
- domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
+ domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat);
+ domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm);
domain->sig.dif.app_escape = true;
if (pi_type == NVME_NS_DPS_PI_TYPE3)
domain->sig.dif.ref_escape = true;
@@ -1876,6 +1876,8 @@ static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
*/
priv.hrqsize = cpu_to_le16(queue->queue_size);
priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
+ /* cntlid should only be set when creating an I/O queue */
+ priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid);
}
ret = rdma_connect_locked(queue->cm_id, &param);
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index ba05faaac562..eb345551d6fe 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -664,19 +664,6 @@ static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR,
nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store);
#endif
-#ifdef CONFIG_NVME_TCP_TLS
-static ssize_t tls_key_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
-
- if (!ctrl->tls_key)
- return 0;
- return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key));
-}
-static DEVICE_ATTR_RO(tls_key);
-#endif
-
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -704,9 +691,6 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_dhchap_secret.attr,
&dev_attr_dhchap_ctrl_secret.attr,
#endif
-#ifdef CONFIG_NVME_TCP_TLS
- &dev_attr_tls_key.attr,
-#endif
&dev_attr_adm_passthru_err_log_enabled.attr,
NULL
};
@@ -737,11 +721,6 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts)
return 0;
#endif
-#ifdef CONFIG_NVME_TCP_TLS
- if (a == &dev_attr_tls_key.attr &&
- (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")))
- return 0;
-#endif
return a->mode;
}
@@ -752,8 +731,77 @@ const struct attribute_group nvme_dev_attrs_group = {
};
EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
+#ifdef CONFIG_NVME_TCP_TLS
+static ssize_t tls_key_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (!ctrl->tls_pskid)
+ return 0;
+ return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid);
+}
+static DEVICE_ATTR_RO(tls_key);
+
+static ssize_t tls_configured_key_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct key *key = ctrl->opts->tls_key;
+
+ return sysfs_emit(buf, "%08x\n", key_serial(key));
+}
+static DEVICE_ATTR_RO(tls_configured_key);
+
+static ssize_t tls_keyring_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct key *keyring = ctrl->opts->keyring;
+
+ return sysfs_emit(buf, "%s\n", keyring->description);
+}
+static DEVICE_ATTR_RO(tls_keyring);
+
+static struct attribute *nvme_tls_attrs[] = {
+ &dev_attr_tls_key.attr,
+ &dev_attr_tls_configured_key.attr,
+ &dev_attr_tls_keyring.attr,
+};
+
+static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
+ struct attribute *a, int n)
+{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))
+ return 0;
+
+ if (a == &dev_attr_tls_key.attr &&
+ !ctrl->opts->tls)
+ return 0;
+ if (a == &dev_attr_tls_configured_key.attr &&
+ !ctrl->opts->tls_key)
+ return 0;
+ if (a == &dev_attr_tls_keyring.attr &&
+ !ctrl->opts->keyring)
+ return 0;
+
+ return a->mode;
+}
+
+const struct attribute_group nvme_tls_attrs_group = {
+ .attrs = nvme_tls_attrs,
+ .is_visible = nvme_tls_attrs_are_visible,
+};
+#endif
+
const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
+#ifdef CONFIG_NVME_TCP_TLS
+ &nvme_tls_attrs_group,
+#endif
NULL,
};
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a2a47d3ab99f..89c44413c593 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -165,6 +165,7 @@ struct nvme_tcp_queue {
bool hdr_digest;
bool data_digest;
+ bool tls_enabled;
struct ahash_request *rcv_hash;
struct ahash_request *snd_hash;
__le32 exp_ddgst;
@@ -213,7 +214,21 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
return queue - queue->ctrl->queues;
}
-static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
+/*
+ * Check if the queue is TLS encrypted
+ */
+static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue)
+{
+ if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
+ return 0;
+
+ return queue->tls_enabled;
+}
+
+/*
+ * Check if TLS is configured for the controller.
+ */
+static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl)
{
if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
return 0;
@@ -368,7 +383,7 @@ static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
{
- return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
+ return !nvme_tcp_queue_tls(queue) &&
nvme_tcp_queue_has_pending(queue);
}
@@ -1051,7 +1066,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
else
msg.msg_flags |= MSG_MORE;
- if (!sendpage_ok(page))
+ if (!sendpages_ok(page, len, offset))
msg.msg_flags &= ~MSG_SPLICE_PAGES;
bvec_set_page(&bvec, page, len, offset);
@@ -1427,7 +1442,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
memset(&msg, 0, sizeof(msg));
iov.iov_base = icresp;
iov.iov_len = sizeof(*icresp);
- if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+ if (nvme_tcp_queue_tls(queue)) {
msg.msg_control = cbuf;
msg.msg_controllen = sizeof(cbuf);
}
@@ -1439,7 +1454,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
goto free_icresp;
}
ret = -ENOTCONN;
- if (nvme_tcp_tls(&queue->ctrl->ctrl)) {
+ if (nvme_tcp_queue_tls(queue)) {
ctype = tls_get_record_type(queue->sock->sk,
(struct cmsghdr *)cbuf);
if (ctype != TLS_RECORD_TYPE_DATA) {
@@ -1581,13 +1596,16 @@ static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
goto out_complete;
}
- tls_key = key_lookup(pskid);
+ tls_key = nvme_tls_key_lookup(pskid);
if (IS_ERR(tls_key)) {
dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
qid, pskid);
queue->tls_err = -ENOKEY;
} else {
- ctrl->ctrl.tls_key = tls_key;
+ queue->tls_enabled = true;
+ if (qid == 0)
+ ctrl->ctrl.tls_pskid = key_serial(tls_key);
+ key_put(tls_key);
queue->tls_err = 0;
}
@@ -1768,7 +1786,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
}
/* If PSKs are configured try to start TLS */
- if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
+ if (nvme_tcp_tls_configured(nctrl) && pskid) {
ret = nvme_tcp_start_tls(nctrl, queue, pskid);
if (ret)
goto err_init_connect;
@@ -1829,6 +1847,8 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
mutex_lock(&queue->queue_lock);
if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
__nvme_tcp_stop_queue(queue);
+ /* Stopping the queue will disable TLS */
+ queue->tls_enabled = false;
mutex_unlock(&queue->queue_lock);
}
@@ -1925,16 +1945,17 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
int ret;
key_serial_t pskid = 0;
- if (nvme_tcp_tls(ctrl)) {
+ if (nvme_tcp_tls_configured(ctrl)) {
if (ctrl->opts->tls_key)
pskid = key_serial(ctrl->opts->tls_key);
- else
+ else {
pskid = nvme_tls_psk_default(ctrl->opts->keyring,
ctrl->opts->host->nqn,
ctrl->opts->subsysnqn);
- if (!pskid) {
- dev_err(ctrl->device, "no valid PSK found\n");
- return -ENOKEY;
+ if (!pskid) {
+ dev_err(ctrl->device, "no valid PSK found\n");
+ return -ENOKEY;
+ }
}
}
@@ -1957,13 +1978,14 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
int i, ret;
- if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
+ if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) {
dev_err(ctrl->device, "no PSK negotiated\n");
return -ENOKEY;
}
+
for (i = 1; i < ctrl->queue_count; i++) {
ret = nvme_tcp_alloc_queue(ctrl, i,
- key_serial(ctrl->tls_key));
+ ctrl->tls_pskid);
if (ret)
goto out_free_queues;
}
@@ -2144,6 +2166,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
if (remove)
nvme_unquiesce_admin_queue(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, remove);
+ if (ctrl->tls_pskid) {
+ dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n",
+ ctrl->tls_pskid);
+ ctrl->tls_pskid = 0;
+ }
}
static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 85006b2df8ae..954d4c074770 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -1015,8 +1015,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
if (nvme_is_fabrics(cmd))
return nvmet_parse_fabrics_admin_cmd(req);
- if (unlikely(!nvmet_check_auth_status(req)))
- return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR;
if (nvmet_is_disc_subsys(nvmet_req_subsys(req)))
return nvmet_parse_discovery_cmd(req);
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 8bc3f431c77f..7897d02c681d 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -25,6 +25,18 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
unsigned char key_hash;
char *dhchap_secret;
+ if (!strlen(secret)) {
+ if (set_ctrl) {
+ kfree(host->dhchap_ctrl_secret);
+ host->dhchap_ctrl_secret = NULL;
+ host->dhchap_ctrl_key_hash = 0;
+ } else {
+ kfree(host->dhchap_secret);
+ host->dhchap_secret = NULL;
+ host->dhchap_key_hash = 0;
+ }
+ return 0;
+ }
if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1)
return -EINVAL;
if (key_hash > 3) {
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 1eff8ca6a5f1..1b6264fa5803 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -578,8 +578,8 @@ static void nvmet_rdma_set_sig_domain(struct blk_integrity *bi,
if (control & NVME_RW_PRINFO_PRCHK_REF)
domain->sig.dif.ref_remap = true;
- domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
- domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
+ domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat);
+ domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm);
domain->sig.dif.app_escape = true;
if (pi_type == NVME_NS_DPS_PI_TYPE3)
domain->sig.dif.ref_escape = true;
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index ce13416bc10f..fec5c6cde0a7 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -73,20 +73,13 @@ struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
struct btrfs_bio *orig_bbio,
- u64 map_length, bool use_append)
+ u64 map_length)
{
struct btrfs_bio *bbio;
struct bio *bio;
- if (use_append) {
- unsigned int nr_segs;
-
- bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs,
- &btrfs_clone_bioset, map_length);
- } else {
- bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT,
- GFP_NOFS, &btrfs_clone_bioset);
- }
+ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
+ &btrfs_clone_bioset);
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
bbio->inode = orig_bbio->inode;
@@ -659,6 +652,19 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
return true;
}
+static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
+{
+ unsigned int nr_segs;
+ int sector_offset;
+
+ map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ &nr_segs, map_length);
+ if (sector_offset)
+ return sector_offset << SECTOR_SHIFT;
+ return map_length;
+}
+
static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
struct btrfs_inode *inode = bbio->inode;
@@ -688,10 +694,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
map_length = min(map_length, length);
if (use_append)
- map_length = min(map_length, fs_info->max_zone_append_size);
+ map_length = btrfs_append_map_length(bbio, map_length);
if (map_length < length) {
- bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append);
+ bbio = btrfs_split_bio(fs_info, bbio, map_length);
bio = &bbio->bio;
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index a46e2047bea4..faceadb040f9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,8 +324,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio)
void bio_trim(struct bio *bio, sector_t offset, sector_t size);
extern struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs);
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
- unsigned *segs, struct bio_set *bs, unsigned max_bytes);
+int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, unsigned max_bytes);
/**
* bio_next_split - get next @sectors from a bio, splitting if necessary
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7664d593486..643c9020a35a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1187,7 +1187,8 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
return q->limits.max_segment_size;
}
-static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
+static inline unsigned int
+queue_limits_max_zone_append_sectors(const struct queue_limits *l)
{
unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3065972c94ba..3066af38e94e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1605,6 +1605,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
+void unpin_user_folio(struct folio *folio, unsigned long npages);
void unpin_folios(struct folio **folios, unsigned long nfolios);
static inline bool is_cow_mapping(vm_flags_t flags)
diff --git a/include/linux/net.h b/include/linux/net.h
index 688320b79fcc..b75bc534c1b3 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -322,6 +322,25 @@ static inline bool sendpage_ok(struct page *page)
return !PageSlab(page) && page_count(page) >= 1;
}
+/*
+ * Check sendpage_ok on contiguous pages.
+ */
+static inline bool sendpages_ok(struct page *page, size_t len, size_t offset)
+{
+ struct page *p = page + (offset >> PAGE_SHIFT);
+ size_t count = 0;
+
+ while (count < len) {
+ if (!sendpage_ok(p))
+ return false;
+
+ p++;
+ count += PAGE_SIZE;
+ }
+
+ return true;
+}
+
int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
size_t num, size_t len);
int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
diff --git a/include/linux/nvme-keyring.h b/include/linux/nvme-keyring.h
index e10333d78dbb..19d2b256180f 100644
--- a/include/linux/nvme-keyring.h
+++ b/include/linux/nvme-keyring.h
@@ -12,7 +12,7 @@ key_serial_t nvme_tls_psk_default(struct key *keyring,
const char *hostnqn, const char *subnqn);
key_serial_t nvme_keyring_id(void);
-
+struct key *nvme_tls_key_lookup(key_serial_t key_id);
#else
static inline key_serial_t nvme_tls_psk_default(struct key *keyring,
@@ -24,5 +24,9 @@ static inline key_serial_t nvme_keyring_id(void)
{
return 0;
}
+static inline struct key *nvme_tls_key_lookup(key_serial_t key_id)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
#endif /* !CONFIG_NVME_KEYRING */
#endif /* _NVME_KEYRING_H */
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index eb2f04d636c8..97c5f00b9aa3 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -25,6 +25,7 @@ enum nvme_rdma_cm_status {
NVME_RDMA_CM_NO_RSC = 0x06,
NVME_RDMA_CM_INVALID_IRD = 0x07,
NVME_RDMA_CM_INVALID_ORD = 0x08,
+ NVME_RDMA_CM_INVALID_CNTLID = 0x09,
};
static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
@@ -46,6 +47,8 @@ static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status)
return "invalid IRD";
case NVME_RDMA_CM_INVALID_ORD:
return "Invalid ORD";
+ case NVME_RDMA_CM_INVALID_CNTLID:
+ return "invalid controller ID";
default:
return "unrecognized reason";
}
@@ -64,7 +67,8 @@ struct nvme_rdma_cm_req {
__le16 qid;
__le16 hrqsize;
__le16 hsqsize;
- u8 rsvd[24];
+ __le16 cntlid;
+ u8 rsvd[22];
};
/**
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7b2ae2e43544..b58d9405d65e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -987,8 +987,8 @@ struct nvme_rw_command {
__le16 control;
__le32 dsmgmt;
__le32 reftag;
- __le16 apptag;
- __le16 appmask;
+ __le16 lbat;
+ __le16 lbatm;
};
enum {
@@ -1057,8 +1057,8 @@ struct nvme_write_zeroes_cmd {
__le16 control;
__le32 dsmgmt;
__le32 reftag;
- __le16 apptag;
- __le16 appmask;
+ __le16 lbat;
+ __le16 lbatm;
};
enum nvme_zone_mgmt_action {
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index 80ce0ef43afd..f1d468acfb25 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -42,8 +42,9 @@ enum {
NBD_CMD_WRITE = 1,
NBD_CMD_DISC = 2,
NBD_CMD_FLUSH = 3,
- NBD_CMD_TRIM = 4
+ NBD_CMD_TRIM = 4,
/* userspace defines additional extension commands */
+ NBD_CMD_WRITE_ZEROES = 6,
};
/* values for flags field, these are server interaction specific. */
@@ -51,12 +52,15 @@ enum {
#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */
#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */
#define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */
-/* there is a gap here to match userspace */
+#define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */
#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* supports WRITE_ZEROES */
+/* there is a gap here to match userspace */
#define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */
/* values for cmd flags in the upper 16 bits of request type */
#define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */
+#define NBD_CMD_FLAG_NO_HOLE (1 << 17) /* Do not punch a hole for WRITE_ZEROES */
/* These are client behavior specific flags. */
#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on
diff --git a/mm/gup.c b/mm/gup.c
index 54d0dc3831fb..02c46ae33028 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -416,6 +416,19 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
EXPORT_SYMBOL(unpin_user_pages);
/**
+ * unpin_user_folio() - release pages of a folio
+ * @folio: pointer to folio to be released
+ * @npages: number of pages of same folio
+ *
+ * Release npages of the folio
+ */
+void unpin_user_folio(struct folio *folio, unsigned long npages)
+{
+ gup_put_folio(folio, npages, FOLL_PIN);
+}
+EXPORT_SYMBOL(unpin_user_folio);
+
+/**
* unpin_folios() - release an array of gup-pinned folios.
* @folios: array of folios to be marked dirty and released.
* @nfolios: number of folios in the @folios array.