60 files changed, 657 insertions(+), 740 deletions(-)
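Note while reading: several hunks below (in block/bio.c and block/bio-integrity.c) replace open-coded assignments to the bio_vec fields with a bvec_set_page() call. The helper itself is defined in include/linux/bvec.h and is not part of this diff; a minimal sketch of it, for reference while reading those hunks:

	static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
			unsigned int len, unsigned int offset)
	{
		bv->bv_page = page;	/* page backing this vector entry */
		bv->bv_len = len;	/* segment length in bytes */
		bv->bv_offset = offset;	/* byte offset of the data within the page */
	}

With this helper, a call site such as __bio_add_page() collapses the three field assignments into a single bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off), as the hunks below show.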
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst deleted file mode 100644 index 2ae7f064736a..000000000000 --- a/Documentation/block/capability.rst +++ /dev/null @@ -1,10 +0,0 @@ -=============================== -Generic Block Device Capability -=============================== - -This file documents the sysfs file ``block/<disk>/capability``. - -``capability`` is a bitfield, printed in hexadecimal, indicating which -capabilities a specific block device supports: - -.. kernel-doc:: include/linux/blkdev.h diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst index c4c73db748a8..102953166429 100644 --- a/Documentation/block/index.rst +++ b/Documentation/block/index.rst @@ -10,7 +10,6 @@ Block bfq-iosched biovecs blk-mq - capability cmdline-partition data-integrity deadline-iosched diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b42956ab5550..4fdbbec71647 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -405,7 +405,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) parent = bfqg_parent(bfqg); - lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); + lockdep_assert_held(&bfqg_to_blkg(bfqg)->disk->queue->queue_lock); if (unlikely(!parent)) return; @@ -513,12 +513,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_bfqgd(cpd)); } -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; - bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); + bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; @@ -536,7 +536,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) { struct blkcg_gq *blkg = pd_to_blkg(pd); struct bfq_group *bfqg = blkg_to_bfqg(blkg); - struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_data *bfqd = blkg->disk->queue->elevator->elevator_data; struct bfq_entity *entity = &bfqg->entity; struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); @@ -1001,7 +1001,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) { struct blkcg_gq *blkg; - list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { + list_for_each_entry(blkg, &bfqd->queue->disk->blkg_list, entry) { struct bfq_group *bfqg = blkg_to_bfqg(blkg); bfq_end_wr_async_queues(bfqd, bfqg); @@ -1201,7 +1201,7 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkg->disk->queue->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { @@ -1291,11 +1291,11 @@ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; - return blkg_to_bfqg(bfqd->queue->root_blkg); + return blkg_to_bfqg(bfqd->queue->disk->root_blkg); } struct blkcg_policy blkcg_policy_bfq = { diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 4705c4be90e7..777dcab73c8e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7146,7 +7146,7 @@ static void bfq_exit_queue(struct elevator_queue *e) bfqg_and_blkg_put(bfqd->root_group); #ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); + blkcg_deactivate_policy(bfqd->queue->disk, &blkcg_policy_bfq); #else 
spin_lock_irq(&bfqd->lock); bfq_put_async_queues(bfqd, bfqd->root_group); @@ -7156,7 +7156,7 @@ static void bfq_exit_queue(struct elevator_queue *e) blk_stat_disable_accounting(bfqd->queue); clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); - wbt_enable_default(bfqd->queue); + wbt_enable_default(bfqd->queue->disk); kfree(bfqd); } @@ -7344,7 +7344,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); - wbt_disable_default(q); + wbt_disable_default(q->disk); blk_stat_enable_accounting(q); return 0; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 3f5685c00e36..a3776064c52a 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -124,23 +124,18 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_vec *iv; if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } - iv = bip->bip_vec + bip->bip_vcnt; - if (bip->bip_vcnt && bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; + bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; return len; diff --git a/block/bio.c b/block/bio.c index d7fbc7adfc50..71e411a0c129 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1029,10 +1029,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, if (bio->bi_vcnt >= queue_max_segments(q)) return 0; - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; return len; @@ -1108,15 +1105,10 @@ EXPORT_SYMBOL_GPL(bio_add_zone_append_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); - bv->bv_page = page; - bv->bv_offset = off; - bv->bv_len = len; - + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; } diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 3304e841df7c..b8b8c82e667a 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -107,7 +107,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, struct cgroup_subsys_state *pos_css; unsigned int i; - lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkg->disk->queue->queue_lock); memset(sum, 0, sizeof(*sum)); rcu_read_lock(); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cb110fc51940..c46778d1f3c2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -108,64 +108,41 @@ static struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } -static bool blkcg_policy_enabled(struct request_queue *q, +static bool blkcg_policy_enabled(struct gendisk *disk, const struct blkcg_policy *pol) { - return pol && test_bit(pol->plid, q->blkcg_pols); + return pol && test_bit(pol->plid, disk->blkcg_pols); } -static void blkg_free_workfn(struct work_struct *work) +static void blkg_free(struct blkcg_gq *blkg) { - struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, - free_work); - struct 
request_queue *q = blkg->q; int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), * in order to make sure pd_free_fn() is called in order, the deletion - * of the list blkg->q_node is delayed to here from blkg_destroy(), and + * of the list blkg->entry is delayed to here from blkg_destroy(), and * blkcg_mutex is used to synchronize blkg_free_workfn() and * blkcg_deactivate_policy(). */ - if (q) - mutex_lock(&q->blkcg_mutex); - + mutex_lock(&blkg->disk->blkcg_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - if (blkg->parent) blkg_put(blkg->parent); + list_del_init(&blkg->entry); + mutex_unlock(&blkg->disk->blkcg_mutex); - if (q) { - list_del_init(&blkg->q_node); - mutex_unlock(&q->blkcg_mutex); - blk_put_queue(q); - } - + put_disk(blkg->disk); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } -/** - * blkg_free - free a blkg - * @blkg: blkg to free - * - * Free @blkg which may be partially allocated. - */ -static void blkg_free(struct blkcg_gq *blkg) +static void blkg_free_workfn(struct work_struct *work) { - if (!blkg) - return; - - /* - * Both ->pd_free_fn() and request queue's release handler may - * sleep, so free us by scheduling one work func - */ - INIT_WORK(&blkg->free_work, blkg_free_workfn); - schedule_work(&blkg->free_work); + blkg_free(container_of(work, struct blkcg_gq, free_work)); } static void __blkg_release(struct rcu_head *rcu) @@ -176,7 +153,10 @@ static void __blkg_release(struct rcu_head *rcu) /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - blkg_free(blkg); + + /* ->pd_free_fn() may sleep, so free from a work queue */ + INIT_WORK(&blkg->free_work, blkg_free_workfn); + schedule_work(&blkg->free_work); } /* @@ -265,19 +245,18 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; - if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) - goto err_free; - + goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) - goto err_free; + goto out_exit_refcnt; - if (!blk_get_queue(disk->queue)) - goto err_free; + if (test_bit(GD_DEAD, &disk->state)) + goto out_free_iostat; + get_device(disk_to_dev(disk)); + blkg->disk = disk; - blkg->q = disk->queue; - INIT_LIST_HEAD(&blkg->q_node); + INIT_LIST_HEAD(&blkg->entry); spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); @@ -293,14 +272,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(disk->queue, pol)) + if (!blkcg_policy_enabled(disk, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg); + pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) - goto err_free; - + goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; @@ -309,8 +287,17 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, return blkg; -err_free: - blkg_free(blkg); +out_free_pds: + while (--i >= 0) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + put_disk(blkg->disk); +out_free_iostat: + free_percpu(blkg->iostat_cpu); +out_exit_refcnt: + percpu_ref_exit(&blkg->refcnt); +out_free_blkg: + kfree(blkg); 
return NULL; } @@ -350,7 +337,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; @@ -371,7 +358,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &disk->queue->blkg_list); + list_add(&blkg->entry, &disk->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -396,7 +383,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, err_put_css: css_put(&blkcg->css); err_free_blkg: - blkg_free(new_blkg); + if (new_blkg) + blkg_free(new_blkg); return ERR_PTR(ret); } @@ -422,12 +410,12 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, WARN_ON_ONCE(!rcu_read_lock_held()); - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk); if (blkg) { if (blkcg != &blkcg_root && blkg != rcu_dereference(blkcg->blkg_hint)) @@ -443,10 +431,10 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - struct blkcg_gq *ret_blkg = q->root_blkg; + struct blkcg_gq *ret_blkg = disk->root_blkg; while (parent) { - blkg = blkg_lookup(parent, q); + blkg = blkg_lookup(parent, disk); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -475,7 +463,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg *blkcg = blkg->blkcg; int i; - lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkg->disk->queue->queue_lock); lockdep_assert_held(&blkcg->lock); /* @@ -499,7 +487,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) blkg->online = false; - radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); + radix_tree_delete(&blkcg->blkg_tree, blkg->disk->queue->id); hlist_del_init_rcu(&blkg->blkcg_node); /* @@ -525,7 +513,7 @@ static void blkg_destroy_all(struct gendisk *disk) restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry_safe(blkg, n, &disk->blkg_list, entry) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); @@ -544,7 +532,7 @@ restart: } } - q->root_blkg = NULL; + disk->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } @@ -586,9 +574,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - if (!blkg->q->disk || !blkg->q->disk->bdi->dev) - return NULL; - return bdi_dev_name(blkg->q->disk->bdi); + return bdi_dev_name(blkg->disk->bdi); } /** @@ -620,10 +606,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(&blkg->q->queue_lock); - if (blkcg_policy_enabled(blkg->q, pol)) + spin_lock_irq(&blkg->disk->queue->queue_lock); + if (blkcg_policy_enabled(blkg->disk, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(&blkg->q->queue_lock); + spin_unlock_irq(&blkg->disk->queue->queue_lock); } rcu_read_unlock(); @@ -729,12 +715,12 @@ int 
blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - if (!blkcg_policy_enabled(q, pol)) { + if (!blkcg_policy_enabled(disk, pol)) { ret = -EOPNOTSUPP; goto fail_unlock; } - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk); if (blkg) goto success; @@ -748,7 +734,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !blkg_lookup(parent, q)) { + while (parent && !blkg_lookup(parent, disk)) { pos = parent; parent = blkcg_parent(parent); } @@ -772,13 +758,13 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, rcu_read_lock(); spin_lock_irq(&q->queue_lock); - if (!blkcg_policy_enabled(q, pol)) { + if (!blkcg_policy_enabled(disk, pol)) { blkg_free(new_blkg); ret = -EOPNOTSUPP; goto fail_preloaded; } - blkg = blkg_lookup(pos, q); + blkg = blkg_lookup(pos, disk); if (blkg) { blkg_free(new_blkg); } else { @@ -952,7 +938,7 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); - struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; + struct blkcg_gq *blkg = bdev->bd_disk->root_blkg; struct blkg_iostat tmp; int cpu; unsigned long flags; @@ -1047,9 +1033,9 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(&blkg->q->queue_lock); + spin_lock_irq(&blkg->disk->queue->queue_lock); blkcg_print_one_stat(blkg, sf); - spin_unlock_irq(&blkg->q->queue_lock); + spin_unlock_irq(&blkg->disk->queue->queue_lock); } rcu_read_unlock(); return 0; @@ -1119,7 +1105,7 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); - struct request_queue *q = blkg->q; + struct request_queue *q = blkg->disk->queue; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* @@ -1299,8 +1285,8 @@ int blkcg_init_disk(struct gendisk *disk) bool preloaded; int ret; - INIT_LIST_HEAD(&q->blkg_list); - mutex_init(&q->blkcg_mutex); + INIT_LIST_HEAD(&disk->blkg_list); + mutex_init(&disk->blkcg_mutex); new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) @@ -1314,7 +1300,7 @@ int blkcg_init_disk(struct gendisk *disk) blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; - q->root_blkg = blkg; + disk->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (preloaded) @@ -1377,9 +1363,9 @@ static void blkcg_bind(struct cgroup_subsys_state *root_css) static void blkcg_exit(struct task_struct *tsk) { - if (tsk->throttle_queue) - blk_put_queue(tsk->throttle_queue); - tsk->throttle_queue = NULL; + if (tsk->throttle_disk) + put_disk(tsk->throttle_disk); + tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { @@ -1405,14 +1391,14 @@ struct cgroup_subsys io_cgrp_subsys = { EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** - * blkcg_activate_policy - activate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_activate_policy - activate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to activate * - * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * Activate @pol on @disk. Requires %GFP_KERNEL context. 
@disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * - * Activation happens with @q bypassed, so nobody would be accessing blkgs + * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. @@ -1420,14 +1406,14 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys); * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; - if (blkcg_policy_enabled(q, pol)) + if (blkcg_policy_enabled(disk, pol)) return 0; if (queue_is_mq(q)) @@ -1436,7 +1422,7 @@ retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to allocate parents first */ - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { + list_for_each_entry_reverse(blkg, &disk->blkg_list, entry) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) @@ -1447,8 +1433,8 @@ retry: pd = pd_prealloc; pd_prealloc = NULL; } else { - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, - blkg->blkcg); + pd = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { @@ -1465,8 +1451,8 @@ retry: if (pd_prealloc) pol->pd_free_fn(pd_prealloc); - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, - blkg->blkcg); + pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_KERNEL); if (pd_prealloc) goto retry; else @@ -1481,16 +1467,16 @@ retry: /* all allocated, init in the same order */ if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + list_for_each_entry_reverse(blkg, &disk->blkg_list, entry) pol->pd_init_fn(blkg->pd[pol->plid]); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { + list_for_each_entry_reverse(blkg, &disk->blkg_list, entry) { if (pol->pd_online_fn) pol->pd_online_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid]->online = true; } - __set_bit(pol->plid, q->blkcg_pols); + __set_bit(pol->plid, disk->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); @@ -1506,7 +1492,7 @@ out: enomem: /* alloc failed, nothing's initialized yet, free everything */ spin_lock_irq(&q->queue_lock); - list_for_each_entry(blkg, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &disk->blkg_list, entry) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); @@ -1523,30 +1509,31 @@ enomem: EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** - * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to deactivate * - * Deactivate @pol on @q. Follows the same synchronization rules as + * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). 
*/ -void blkcg_deactivate_policy(struct request_queue *q, +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk, pol)) return; if (queue_is_mq(q)) blk_mq_freeze_queue(q); - mutex_lock(&q->blkcg_mutex); + mutex_lock(&disk->blkcg_mutex); spin_lock_irq(&q->queue_lock); - __clear_bit(pol->plid, q->blkcg_pols); + __clear_bit(pol->plid, disk->blkcg_pols); - list_for_each_entry(blkg, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &disk->blkg_list, entry) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); @@ -1560,7 +1547,7 @@ void blkcg_deactivate_policy(struct request_queue *q, } spin_unlock_irq(&q->queue_lock); - mutex_unlock(&q->blkcg_mutex); + mutex_unlock(&disk->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); @@ -1830,29 +1817,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we - * check to see if current->throttle_queue is set and if not this doesn't do + * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { - struct request_queue *q = current->throttle_queue; + struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; - if (!q) + if (!disk) return; - current->throttle_queue = NULL; + current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk); if (!blkg) goto out; if (!blkg_tryget(blkg)) @@ -1861,11 +1848,10 @@ void blkcg_maybe_throttle_current(void) blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); - blk_put_queue(q); + put_disk(disk); return; out: rcu_read_unlock(); - blk_put_queue(q); } /** @@ -1887,18 +1873,17 @@ out: */ void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { - struct request_queue *q = disk->queue; - if (unlikely(current->flags & PF_KTHREAD)) return; - if (current->throttle_queue != q) { - if (!blk_get_queue(q)) + if (current->throttle_disk != disk) { + if (test_bit(GD_DEAD, &disk->state)) return; + get_device(disk_to_dev(disk)); - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; + if (current->throttle_disk) + put_disk(current->throttle_disk); + current->throttle_disk = disk; } if (use_memdelay) @@ -1959,7 +1944,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg - * and q->root_blkg. This situation only happens when a cgroup is dying and + * and disk->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. 
* * A reference will be taken on the blkg and will be released when @bio is @@ -1974,8 +1959,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); - bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; + blkg_get(bio->bi_bdev->bd_disk->root_blkg); + bio->bi_blkg = bio->bi_bdev->bd_disk->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b13ee84f358e..e442b406ca0d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -53,9 +53,8 @@ struct blkg_iostat_set { /* association between a blk cgroup and a request queue */ struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; + struct gendisk *disk; + struct list_head entry; struct hlist_node blkcg_node; struct blkcg *blkcg; @@ -155,8 +154,8 @@ typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, - struct request_queue *q, struct blkcg *blkcg); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp); typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); @@ -195,9 +194,8 @@ void blkcg_exit_disk(struct gendisk *disk); /* Blkio controller policy registration */ int blkcg_policy_register(struct blkcg_policy *pol); void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol); const char *blkg_dev_name(struct blkcg_gq *blkg); @@ -236,30 +234,30 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) } /** - * blkg_lookup - lookup blkg for the specified blkcg - q pair + * blkg_lookup - lookup blkg for the specified blkcg - disk pair * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. + * Lookup blkg for the @blkcg - @disk pair. * Must be called in a RCU critical section. 
*/ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) + struct gendisk *disk) { struct blkcg_gq *blkg; WARN_ON_ONCE(!rcu_read_lock_held()); if (blkcg == &blkcg_root) - return q->root_blkg; + return disk->root_blkg; blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) + if (blkg && blkg->disk == disk) return blkg; - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q != q) + blkg = radix_tree_lookup(&blkcg->blkg_tree, disk->queue->id); + if (blkg && blkg->disk != disk) blkg = NULL; return blkg; } @@ -359,7 +357,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q))) + (p_blkg)->disk))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants @@ -374,7 +372,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q))) + (p_blkg)->disk))) bool __blkcg_punt_bio_submit(struct bio *bio); @@ -496,9 +494,9 @@ static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } static inline void blkcg_exit_disk(struct gendisk *disk) { } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, +static inline int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, +static inline void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 9694394ed883..7a2dc9dc8e3b 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -669,7 +669,7 @@ static struct ioc *q_to_ioc(struct request_queue *q) static const char __maybe_unused *ioc_name(struct ioc *ioc) { - struct gendisk *disk = ioc->rqos.q->disk; + struct gendisk *disk = ioc->rqos.disk; if (!disk) return "<unknown>"; @@ -808,11 +808,11 @@ static int ioc_autop_idx(struct ioc *ioc) u64 now_ns; /* rotational? 
*/ - if (!blk_queue_nonrot(ioc->rqos.q)) + if (!blk_queue_nonrot(ioc->rqos.disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ - if (blk_queue_depth(ioc->rqos.q) == 1) + if (blk_queue_depth(ioc->rqos.disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ @@ -2649,7 +2649,7 @@ retry_lock: if (use_debt) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q->disk, + blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); iocg_unlock(iocg, ioc_locked, &flags); return; @@ -2750,7 +2750,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, if (likely(!list_empty(&iocg->active_list))) { iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q->disk, + blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { iocg_commit_bio(iocg, bio, abs_cost, cost); @@ -2821,7 +2821,7 @@ static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); - blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; @@ -2832,7 +2832,7 @@ static void ioc_rqos_exit(struct rq_qos *rqos) kfree(ioc); } -static struct rq_qos_ops ioc_rqos_ops = { +static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, @@ -2843,9 +2843,7 @@ static struct rq_qos_ops ioc_rqos_ops = { static int blk_iocost_init(struct gendisk *disk) { - struct request_queue *q = disk->queue; struct ioc *ioc; - struct rq_qos *rqos; int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); @@ -2868,11 +2866,6 @@ static int blk_iocost_init(struct gendisk *disk) local64_set(&ccs->rq_wait_ns, 0); } - rqos = &ioc->rqos; - rqos->id = RQ_QOS_COST; - rqos->ops = &ioc_rqos_ops; - rqos->q = q; - spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); @@ -2896,17 +2889,17 @@ static int blk_iocost_init(struct gendisk *disk) * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. 
*/ - ret = rq_qos_add(q, rqos); + ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); if (ret) goto err_free_ioc; - ret = blkcg_activate_policy(q, &blkcg_policy_iocost); + ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); if (ret) goto err_del_qos; return 0; err_del_qos: - rq_qos_del(q, rqos); + rq_qos_del(&ioc->rqos); err_free_ioc: free_percpu(ioc->pcpu_stat); kfree(ioc); @@ -2930,13 +2923,14 @@ static void ioc_cpd_free(struct blkcg_policy_data *cpd) kfree(container_of(cpd, struct ioc_cgrp, cpd)); } -static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, + disk->node_id); if (!iocg) return NULL; @@ -2953,7 +2947,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); - struct ioc *ioc = q_to_ioc(blkg->q); + struct ioc *ioc = q_to_ioc(blkg->disk->queue); struct ioc_now now; struct blkcg_gq *tblkg; unsigned long flags; @@ -3285,11 +3279,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - wbt_disable_default(disk->queue); + wbt_disable_default(disk); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; - wbt_enable_default(disk->queue); + wbt_enable_default(disk); } if (user) { diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ecdc10741836..5d5aa1e526b7 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q->disk, use_memdelay); + blkcg_schedule_throttle(rqos->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are @@ -330,7 +330,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { - unsigned long qd = blkiolat->rqos.q->nr_requests; + unsigned long qd = blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; @@ -372,7 +372,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, */ static void scale_change(struct iolatency_grp *iolat, bool up) { - unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; + unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = iolat->max_depth; @@ -646,11 +646,11 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) timer_shutdown_sync(&blkiolat->timer); flush_work(&blkiolat->enable_work); - blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency); kfree(blkiolat); } -static struct rq_qos_ops blkcg_iolatency_ops = { +static const struct rq_qos_ops blkcg_iolatency_ops = { .throttle = blkcg_iolatency_throttle, .done_bio = blkcg_iolatency_done_bio, .exit = blkcg_iolatency_exit, @@ -665,7 +665,7 @@ static void blkiolatency_timer_fn(struct 
timer_list *t) rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, - blkiolat->rqos.q->root_blkg) { + blkiolat->rqos.disk->root_blkg) { struct iolatency_grp *iolat; struct child_latency_info *lat_info; unsigned long flags; @@ -749,32 +749,26 @@ static void blkiolatency_enable_work_fn(struct work_struct *work) */ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { - blk_mq_freeze_queue(blkiolat->rqos.q); + blk_mq_freeze_queue(blkiolat->rqos.disk->queue); blkiolat->enabled = enabled; - blk_mq_unfreeze_queue(blkiolat->rqos.q); + blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue); } } int blk_iolatency_init(struct gendisk *disk) { - struct request_queue *q = disk->queue; struct blk_iolatency *blkiolat; - struct rq_qos *rqos; int ret; blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); if (!blkiolat) return -ENOMEM; - rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_LATENCY; - rqos->ops = &blkcg_iolatency_ops; - rqos->q = q; - - ret = rq_qos_add(q, rqos); + ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY, + &blkcg_iolatency_ops); if (ret) goto err_free; - ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); + ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency); if (ret) goto err_qos_del; @@ -784,7 +778,7 @@ int blk_iolatency_init(struct gendisk *disk) return 0; err_qos_del: - rq_qos_del(q, rqos); + rq_qos_del(&blkiolat->rqos); err_free: kfree(blkiolat); return ret; @@ -952,13 +946,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) iolat->max_depth, avg_lat, cur_win); } -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); + iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), @@ -974,12 +967,12 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct rq_qos *rqos = blkcg_rq_qos(blkg->disk->queue); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); u64 now = ktime_to_ns(ktime_get()); int cpu; - if (blk_queue_nonrot(blkg->q)) + if (blk_queue_nonrot(blkg->disk->queue)) iolat->ssd = true; else iolat->ssd = false; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 8bb6b8eba4ce..055529b9b92b 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -116,7 +116,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, } static struct blkg_policy_data * -ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) +ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; @@ -204,12 +204,12 @@ void blkcg_set_ioprio(struct bio *bio) void blk_ioprio_exit(struct gendisk *disk) { - blkcg_deactivate_policy(disk->queue, &ioprio_policy); + blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { - return blkcg_activate_policy(disk->queue, &ioprio_policy); + return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index bd942341b638..b01818f8e216 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ 
-813,9 +813,9 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { - lockdep_assert_held(&rqos->q->debugfs_mutex); + lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); - if (!rqos->q->debugfs_dir) + if (!rqos->disk->queue->debugfs_dir) return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; @@ -823,7 +823,7 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { - struct request_queue *q = rqos->q; + struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); lockdep_assert_held(&q->debugfs_mutex); @@ -835,9 +835,7 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); - rqos->debugfs_dir = debugfs_create_dir(dir_name, - rqos->q->rqos_debugfs_dir); - + rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 23d1a90fec42..06b312c69114 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -19,8 +19,7 @@ #include "blk-wbt.h" /* - * Mark a hardware queue as needing a restart. For shared queues, maintain - * a count of how many hardware queues are marked for restart. + * Mark a hardware queue as needing a restart. */ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { @@ -82,7 +81,7 @@ dispatch: /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. @@ -210,7 +209,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. diff --git a/block/blk-mq.c b/block/blk-mq.c index 9d463f7563bc..89b4dd81ae17 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -658,7 +658,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * allocator for this for the rare use case of a command tied to * a specific queue. 
*/ - if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || + WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) @@ -1825,12 +1826,13 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && + !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* @@ -1848,6 +1850,10 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) + sbq = &hctx->tags->breserved_tags; + else + sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); @@ -1917,16 +1923,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { - struct request *next = - list_first_entry_or_null(list, struct request, queuelist); - - /* - * If an I/O scheduler has been configured and we got a driver tag for - * the next request already, free it. - */ - if (next) - blk_mq_put_driver_tag(next); - list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } @@ -2002,6 +1998,23 @@ static void blk_mq_release_budgets(struct request_queue *q, } /* + * blk_mq_commit_rqs will notify driver using bd->last that there is no + * more requests. (See comment in struct blk_mq_ops for commit_rqs for + * details) + * Attention, we should explicitly call this in unusual cases: + * 1) did not queue everything initially scheduled to queue + * 2) the last attempt to queue a request failed + */ +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, + bool from_schedule) +{ + if (hctx->queue->mq_ops->commit_rqs && queued) { + trace_block_unplug(hctx->queue, queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } +} + +/* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, @@ -2009,8 +2022,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, { enum prep_dispatch prep; struct request_queue *q = hctx->queue; - struct request *rq, *nxt; - int errors, queued; + struct request *rq; + int queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); bool needs_resource = false; @@ -2021,7 +2034,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, /* * Now process all the entries, sending them to the driver. */ - errors = queued = 0; + queued = 0; do { struct blk_mq_queue_data bd; @@ -2035,17 +2048,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, list_del_init(&rq->queuelist); bd.rq = rq; - - /* - * Flag last if we have no more requests, or if we have more - * but can't assign a driver tag to it. 
- */ - if (list_empty(list)) - bd.last = true; - else { - nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt); - } + bd.last = list_empty(list); /* * once the request is queued to lld, no need to cover the @@ -2074,7 +2077,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, needs_resource = true; break; default: - errors++; blk_mq_end_request(rq, ret); } } while (!list_empty(list)); @@ -2085,9 +2087,9 @@ out: /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ - if ((!list_empty(list) || errors || needs_resource || - ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) - q->mq_ops->commit_rqs(hctx); + if (!list_empty(list) || ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); + /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. @@ -2096,7 +2098,8 @@ out: bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && - (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); + ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || + blk_mq_is_shared_tags(hctx->flags)); if (nr_budgets) blk_mq_release_budgets(q, list); @@ -2151,10 +2154,10 @@ out: blk_mq_update_dispatch_busy(hctx, true); return false; - } else - blk_mq_update_dispatch_busy(hctx, false); + } - return (queued + errors) != 0; + blk_mq_update_dispatch_busy(hctx, false); + return true; } /** @@ -2548,16 +2551,6 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, - bool from_schedule) -{ - if (hctx->queue->mq_ops->commit_rqs) { - trace_block_unplug(hctx->queue, *queued, !from_schedule); - hctx->queue->mq_ops->commit_rqs(hctx); - } - *queued = 0; -} - static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { @@ -2681,20 +2674,21 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); } -static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +static void blk_mq_plug_issue_direct(struct blk_plug *plug) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; - int errors = 0; + blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { bool last = rq_list_empty(plug->mq_list); - blk_status_t ret; if (hctx != rq->mq_hctx) { - if (hctx) - blk_mq_commit_rqs(hctx, &queued, from_schedule); + if (hctx) { + blk_mq_commit_rqs(hctx, queued, false); + queued = 0; + } hctx = rq->mq_hctx; } @@ -2706,21 +2700,16 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, false, true); - blk_mq_commit_rqs(hctx, &queued, from_schedule); - return; + goto out; default: blk_mq_end_request(rq, ret); - errors++; break; } } - /* - * If we didn't flush the entire list, we could have told the driver - * there was more coming, but that turned out to be a lie. 
- */ - if (errors) - blk_mq_commit_rqs(hctx, &queued, from_schedule); +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_plug_list(struct request_queue *q, @@ -2791,7 +2780,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) } blk_mq_run_dispatch_ops(q, - blk_mq_plug_issue_direct(plug, false)); + blk_mq_plug_issue_direct(plug)); if (rq_list_empty(plug->mq_list)) return; } @@ -2805,36 +2794,32 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; - int errors = 0; + blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); - if (ret != BLK_STS_OK) { - errors++; - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, false, - list_empty(list)); - break; - } - blk_mq_end_request(rq, ret); - } else + switch (ret) { + case BLK_STS_OK: queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, + list_empty(list)); + goto out; + default: + blk_mq_end_request(rq, ret); + break; + } } - /* - * If we didn't flush the entire list, we could have told - * the driver there was more coming, but that turned out to - * be a lie. - */ - if ((!list_empty(list) || errors) && - hctx->queue->mq_ops->commit_rqs && queued) - hctx->queue->mq_ops->commit_rqs(hctx); +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 88f0fe7dcf54..d8cc820a365e 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,3 +294,70 @@ void rq_qos_exit(struct request_queue *q) rqos->ops->exit(rqos); } } + +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops) +{ + struct request_queue *q = disk->queue; + + rqos->disk = disk; + rqos->id = id; + rqos->ops = ops; + + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + * + * Reuse ->queue_lock for protecting against other concurrent + * rq_qos adding/deleting + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + if (rq_qos_id(q, rqos->id)) + goto ebusy; + rqos->next = q->rq_qos; + q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } + + return 0; +ebusy: + spin_unlock_irq(&q->queue_lock); + blk_mq_unfreeze_queue(q); + return -EBUSY; +} + +void rq_qos_del(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->disk->queue; + struct rq_qos **cur; + + /* + * See comment in rq_qos_add() about freezing queue & using + * ->queue_lock. 
+ */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; + break; + } + } + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 1ef1f7d4bc3c..b02a1a3d33a8 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -25,8 +25,8 @@ struct rq_wait { }; struct rq_qos { - struct rq_qos_ops *ops; - struct request_queue *q; + const struct rq_qos_ops *ops; + struct gendisk *disk; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS @@ -85,65 +85,9 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) init_waitqueue_head(&rq_wait->wait); } -static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos) -{ - /* - * No IO can be in-flight when adding rqos, so freeze queue, which - * is fine since we only support rq_qos for blk-mq queue. - * - * Reuse ->queue_lock for protecting against other concurrent - * rq_qos adding/deleting - */ - blk_mq_freeze_queue(q); - - spin_lock_irq(&q->queue_lock); - if (rq_qos_id(q, rqos->id)) - goto ebusy; - rqos->next = q->rq_qos; - q->rq_qos = rqos; - spin_unlock_irq(&q->queue_lock); - - blk_mq_unfreeze_queue(q); - - if (rqos->ops->debugfs_attrs) { - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_register_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); - } - - return 0; -ebusy: - spin_unlock_irq(&q->queue_lock); - blk_mq_unfreeze_queue(q); - return -EBUSY; - -} - -static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) -{ - struct rq_qos **cur; - - /* - * See comment in rq_qos_add() about freezing queue & using - * ->queue_lock. - */ - blk_mq_freeze_queue(q); - - spin_lock_irq(&q->queue_lock); - for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { - if (*cur == rqos) { - *cur = rqos->next; - break; - } - } - spin_unlock_irq(&q->queue_lock); - - blk_mq_unfreeze_queue(q); - - mutex_lock(&q->debugfs_mutex); - blk_mq_debugfs_unregister_rqos(rqos); - mutex_unlock(&q->debugfs_mutex); -} +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops); +void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-settings.c b/block/blk-settings.c index 9c9713c9269c..896b4654ab00 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -16,6 +16,7 @@ #include <linux/dma-mapping.h> #include "blk.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) diff --git a/block/blk-stat.c b/block/blk-stat.c index 2ea01b5c1aca..c6ca16abf911 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -58,7 +58,8 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? 
now - rq->io_start_time_ns : 0; - blk_throtl_stat_add(rq, value); + if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) + blk_throtl_stat_add(rq, value); rcu_read_lock(); cpu = get_cpu(); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5486b6c57f6b..d70ebecb5347 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,6 +16,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" #include "blk-cgroup.h" #include "blk-throttle.h" @@ -500,7 +501,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q); + ret = wbt_init(q->disk); if (ret) return ret; } @@ -826,7 +827,7 @@ int blk_register_queue(struct gendisk *disk) goto out_elv_unregister; blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(q); + wbt_enable_default(disk); blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 6fb5a2f9e1ee..902203bdddb4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -335,14 +335,13 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq) timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, q->node); + tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; @@ -388,7 +387,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td = blkg->q->td; + struct throtl_data *td = blkg->disk->queue->td; struct throtl_service_queue *sq = &tg->service_queue; /* @@ -452,7 +451,8 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) bool low_valid = false; rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + blkg_for_each_descendant_post(blkg, pos_css, + td->queue->disk->root_blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || @@ -1175,13 +1175,13 @@ static void throtl_pending_timer_fn(struct timer_list *t) /* throtl_data may be gone, so figure out request queue by blkg */ if (tg) - q = tg->pd.blkg->q; + q = tg->pd.blkg->disk->queue; else q = td->queue; spin_lock_irq(&q->queue_lock); - if (!q->root_blkg) + if (!q->disk->root_blkg) goto out_unlock; if (throtl_can_upgrade(td, NULL)) @@ -1323,7 +1323,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) * blk-throttle. */ blkg_for_each_descendant_pre(blkg, pos_css, - global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { + global ? tg->td->queue->disk->root_blkg : + tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); struct throtl_grp *parent_tg; @@ -1718,7 +1719,7 @@ void blk_throtl_cancel_bios(struct gendisk *disk) * path need RCU protection and to prevent warning from lockdep. 
*/ rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + blkg_for_each_descendant_post(blkg, pos_css, disk->root_blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq = &tg->service_queue; @@ -1872,7 +1873,8 @@ static bool throtl_can_upgrade(struct throtl_data *td, return false; rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + blkg_for_each_descendant_post(blkg, pos_css, + td->queue->disk->root_blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); if (tg == this_tg) @@ -1918,7 +1920,8 @@ static void throtl_upgrade_state(struct throtl_data *td) td->low_upgrade_time = jiffies; td->scale = 0; rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + blkg_for_each_descendant_post(blkg, pos_css, + td->queue->disk->root_blkg) { struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq = &tg->service_queue; @@ -2395,7 +2398,7 @@ int blk_throtl_init(struct gendisk *disk) td->low_downgrade_time = jiffies; /* activate policy */ - ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { free_percpu(td->latency_buckets[READ]); free_percpu(td->latency_buckets[WRITE]); @@ -2411,7 +2414,7 @@ void blk_throtl_exit(struct gendisk *disk) BUG_ON(!q->td); del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blkcg_deactivate_policy(disk, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 68a774d7a7c9..e49a48684532 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,6 +25,7 @@ #include <linux/backing-dev.h> #include <linux/swap.h> +#include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" @@ -32,6 +33,72 @@ #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
+ */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned int wc; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; @@ -98,7 +165,7 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->disk->bdi->wb; + struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; return time_before(jiffies, wb->dirty_sleep + HZ); } @@ -226,6 +293,16 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) return now - issue; } +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + enum { LAT_OK = 1, LAT_UNKNOWN, @@ -235,7 +312,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -288,7 +365,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->disk->bdi; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -358,13 +435,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; - if (!rwb->rqos.q->disk) + if (!rwb->rqos.disk) return; status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->disk->bdi, status, rqd->scale_step, - inflight); + trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. 
If we did not, @@ -650,8 +726,9 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) /* * Enable wbt if defaults are configured that way */ -void wbt_enable_default(struct request_queue *q) +void wbt_enable_default(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_qos *rqos; bool disable_flag = q->elevator && test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); @@ -670,7 +747,7 @@ void wbt_enable_default(struct request_queue *q) return; if (queue_is_mq(q) && !disable_flag) - wbt_init(q); + wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -701,16 +778,15 @@ static int wbt_data_dir(const struct request *rq) static void wbt_queue_depth_changed(struct rq_qos *rqos) { - RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); - struct request_queue *q = rqos->q; - blk_stat_remove_callback(q, rwb->cb); + blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } @@ -718,9 +794,9 @@ static void wbt_exit(struct rq_qos *rqos) /* * Disable wbt, if enabled by default. */ -void wbt_disable_default(struct request_queue *q) +void wbt_disable_default(struct gendisk *disk) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; @@ -820,7 +896,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { }; #endif -static struct rq_qos_ops wbt_rqos_ops = { +static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, @@ -834,8 +910,9 @@ static struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct request_queue *q) +int wbt_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; int ret; @@ -853,22 +930,19 @@ int wbt_init(struct request_queue *q) for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); - rwb->rqos.id = RQ_QOS_WBT; - rwb->rqos.ops = &wbt_rqos_ops; - rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); - - wbt_queue_depth_changed(&rwb->rqos); + rwb->rq_depth.queue_depth = blk_queue_depth(q); + wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. 
*/ - ret = rq_qos_add(q, &rwb->rqos); + ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); if (ret) goto err_free; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index e3ea6e7e2900..ba6cca5849a6 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -2,97 +2,11 @@ #ifndef WB_THROTTLE_H #define WB_THROTTLE_H -#include <linux/kernel.h> -#include <linux/atomic.h> -#include <linux/wait.h> -#include <linux/timer.h> -#include <linux/ktime.h> - -#include "blk-stat.h" -#include "blk-rq-qos.h" - -enum wbt_flags { - WBT_TRACKED = 1, /* write, tracked for throttling */ - WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ - WBT_DISCARD = 8, /* discard */ - - WBT_NR_BITS = 4, /* number of bits */ -}; - -enum { - WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, - WBT_RWQ_DISCARD, - WBT_NUM_RWQ, -}; - -/* - * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other - * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered - * to WBT_STATE_OFF/ON_MANUAL. - */ -enum { - WBT_STATE_ON_DEFAULT = 1, /* on by default */ - WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ - WBT_STATE_OFF_DEFAULT = 3, /* off by default */ - WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ -}; - -struct rq_wb { - /* - * Settings that govern how we throttle - */ - unsigned int wb_background; /* background writeback */ - unsigned int wb_normal; /* normal writeback */ - - short enable_state; /* WBT_STATE_* */ - - /* - * Number of consecutive periods where we don't have enough - * information to make a firm scale up/down decision. - */ - unsigned int unknown_cnt; - - u64 win_nsec; /* default window size */ - u64 cur_win_nsec; /* current window size */ - - struct blk_stat_callback *cb; - - u64 sync_issue; - void *sync_cookie; - - unsigned int wc; - - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ - unsigned long min_lat_nsec; - struct rq_qos rqos; - struct rq_wait rq_wait[WBT_NUM_RWQ]; - struct rq_depth rq_depth; -}; - -static inline struct rq_wb *RQWB(struct rq_qos *rqos) -{ - return container_of(rqos, struct rq_wb, rqos); -} - -static inline unsigned int wbt_inflight(struct rq_wb *rwb) -{ - unsigned int i, ret = 0; - - for (i = 0; i < WBT_NUM_RWQ; i++) - ret += atomic_read(&rwb->rq_wait[i].inflight); - - return ret; -} - - #ifdef CONFIG_BLK_WBT -int wbt_init(struct request_queue *); -void wbt_disable_default(struct request_queue *); -void wbt_enable_default(struct request_queue *); +int wbt_init(struct gendisk *disk); +void wbt_disable_default(struct gendisk *disk); +void wbt_enable_default(struct gendisk *disk); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); @@ -104,14 +18,14 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline int wbt_init(struct request_queue *q) +static inline int wbt_init(struct gendisk *disk) { return -EINVAL; } -static inline void wbt_disable_default(struct request_queue *q) +static inline void wbt_disable_default(struct gendisk *disk) { } -static inline void wbt_enable_default(struct request_queue *q) +static inline void wbt_enable_default(struct gendisk *disk) { } static inline void wbt_set_write_cache(struct request_queue *q, bool wc) diff --git a/block/genhd.c b/block/genhd.c index 23cf83b3331c..7e031559bf51 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -466,10 +466,14 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - ret 
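With the header trimmed to the three gendisk-based entry points, callers now interact with wbt roughly as queue_wb_lat_store() does above. A sketch, with example_enable_wbt as a hypothetical wrapper:

static int example_enable_wbt(struct gendisk *disk)
{
        struct rq_qos *rqos = wbt_rq_qos(disk->queue);

        /* First use allocates and registers the rq_qos via wbt_init(). */
        if (!rqos)
                return wbt_init(disk);
        return 0;
}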
= blk_integrity_add(disk); + ret = blkcg_init_disk(disk); if (ret) goto out_del_block_link; + ret = blk_integrity_add(disk); + if (ret) + goto out_blkcg_exit; + disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { @@ -534,6 +538,8 @@ out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_integrity: blk_integrity_del(disk); +out_blkcg_exit: + blkcg_exit_disk(disk); out_del_block_link: if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(ddev)); @@ -662,6 +668,8 @@ void del_gendisk(struct gendisk *disk) rq_qos_exit(q); blk_mq_unquiesce_queue(q); + blkcg_exit_disk(disk); + /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. @@ -1016,9 +1024,8 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%x\n", disk->flags); + dev_warn_once(dev, "the capability attribute has been deprecated.\n"); + return sprintf(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, @@ -1171,8 +1178,6 @@ static void disk_release(struct device *dev) !test_bit(GD_ADDED, &disk->state)) blk_mq_exit_queue(disk->queue); - blkcg_exit_disk(disk); - bioset_exit(&disk->bio_split); disk_release_events(disk); @@ -1385,9 +1390,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; - if (blkcg_init_disk(disk)) - goto out_erase_part0; - rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; @@ -1400,8 +1402,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, #endif return disk; -out_erase_part0: - xa_erase(&disk->part_tbl, 0); out_destroy_part_tbl: xa_destroy(&disk->part_tbl); disk->part0->bd_disk = NULL; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1518a6423279..5f04235e4ff7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -90,7 +90,7 @@ struct loop_cmd { }; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) -#define LOOP_DEFAULT_HW_Q_DEPTH (128) +#define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -1792,9 +1792,15 @@ static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { - int ret = kstrtoint(s, 10, &hw_queue_depth); + int qd, ret; - return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; + ret = kstrtoint(s, 0, &qd); + if (ret < 0) + return ret; + if (qd < 1) + return -EINVAL; + hw_queue_depth = qd; + return 0; } static const struct kernel_param_ops loop_hw_qdepth_param_ops = { @@ -1803,7 +1809,7 @@ static const struct kernel_param_ops loop_hw_qdepth_param_ops = { }; device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); -MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. 
Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 04453f4a319c..1faca7e07a4d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3068,13 +3068,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) for (i = 0; i < obj_req->copyup_bvec_count; i++) { unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); + struct page *page = alloc_page(GFP_NOIO); - obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (!obj_req->copyup_bvecs[i].bv_page) + if (!page) return -ENOMEM; - obj_req->copyup_bvecs[i].bv_offset = 0; - obj_req->copyup_bvecs[i].bv_len = len; + bvec_set_page(&obj_req->copyup_bvecs[i], page, len, 0); obj_overlap -= len; } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c932e9ea5a0f..d83fe2c2b3ba 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -322,7 +322,7 @@ static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, static inline bool ublk_rq_has_data(const struct request *rq) { - return rq->bio && bio_has_data(rq->bio); + return bio_has_data(rq->bio); } static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, @@ -665,7 +665,7 @@ static void ublk_complete_rq(struct request *req) } /* - * FLUSH or DISCARD usually won't return bytes returned, so end them + * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them * directly. * * Both the two needn't unmap. @@ -1578,7 +1578,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, ub); + disk = blk_mq_alloc_disk(&ub->tag_set, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a77fa917428..dc6e9b989910 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -170,9 +170,7 @@ static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool un WARN_ON_ONCE(n != segments); - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = sizeof(*range) * segments; + bvec_set_virt(&req->special_vec, range, sizeof(*range) * segments); req->rq_flags |= RQF_SPECIAL_PAYLOAD; return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e290d6d97047..bd8ae4822dc3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -703,9 +703,7 @@ static ssize_t writeback_store(struct device *dev, for (; nr_pages != 0; index++, nr_pages--) { struct bio_vec bvec; - bvec.bv_page = page; - bvec.bv_len = PAGE_SIZE; - bvec.bv_offset = 0; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { @@ -1380,12 +1378,9 @@ out: static int zram_bvec_read_from_bdev(struct zram *zram, struct page *page, u32 index, struct bio *bio, bool partial_io) { - struct bio_vec bvec = { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; + struct bio_vec bvec; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); return read_from_bdev(zram, &bvec, zram_get_element(zram, index), bio, partial_io); } @@ -1652,9 +1647,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, memcpy_from_bvec(dst + offset, bvec); kunmap_atomic(dst); - vec.bv_page = page; - vec.bv_len = PAGE_SIZE; - vec.bv_offset = 0; + 
bvec_set_page(&vec, page, PAGE_SIZE, 0); } ret = __zram_bvec_write(zram, &vec, index, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 02b0240e7c71..1961105712b7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -380,6 +380,10 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -389,7 +393,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ static bool is_suspended(struct mddev *mddev, struct bio *bio) { - if (mddev->suspended) + if (is_md_suspended(mddev)) return true; if (bio_data_dir(bio) != WRITE) return false; @@ -405,12 +409,10 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: - rcu_read_lock(); if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { - rcu_read_unlock(); bio_wouldblock_error(bio); return; } @@ -419,23 +421,19 @@ check_suspended: TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break; - rcu_read_unlock(); schedule(); - rcu_read_lock(); } finish_wait(&mddev->sb_wait, &__wait); } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); + if (!percpu_ref_tryget_live(&mddev->active_io)) + goto check_suspended; if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); + percpu_ref_put(&mddev->active_io); goto check_suspended; } - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); + percpu_ref_put(&mddev->active_io); } EXPORT_SYMBOL(md_handle_request); @@ -483,11 +481,10 @@ void mddev_suspend(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; - synchronize_rcu(); wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - smp_mb__after_atomic(); - wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + percpu_ref_kill(&mddev->active_io); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -505,6 +502,7 @@ void mddev_resume(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); @@ -683,7 +681,6 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); - atomic_set(&mddev->active_io, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -5760,6 +5757,12 @@ static void md_safemode_timeout(struct timer_list *t) } static int start_dirty_degraded; +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} int md_run(struct mddev *mddev) { @@ -5840,10 +5843,15 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } + err = percpu_ref_init(&mddev->active_io, active_io_release, + 
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (err) + return err; + if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - return err; + goto exit_active_io; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); @@ -6031,6 +6039,8 @@ abort: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); +exit_active_io: + percpu_ref_exit(&mddev->active_io); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6156,7 +6166,7 @@ static void md_clean(struct mddev *mddev) mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; @@ -6219,7 +6229,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { + if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } @@ -6255,6 +6265,8 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); + percpu_ref_exit(&mddev->writes_pending); + percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); } @@ -7828,6 +7840,7 @@ static void md_free_disk(struct gendisk *disk) struct mddev *mddev = disk->private_data; percpu_ref_exit(&mddev->writes_pending); + percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); @@ -8531,7 +8544,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) return true; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - mddev->suspended); + is_md_suspended(mddev)); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; @@ -8883,7 +8896,7 @@ void md_do_sync(struct md_thread *thread) atomic_set(&mddev->recovery_active, 0); last_check = 0; - if (j>2) { + if (j >= MD_RESYNC_ACTIVE) { pr_debug("md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; @@ -8955,7 +8968,7 @@ void md_do_sync(struct md_thread *thread) if (j > max_sectors) /* when skipping, extra large numbers can be returned. 
*/ j = max_sectors; - if (j > 2) + if (j >= MD_RESYNC_ACTIVE) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) @@ -9030,7 +9043,7 @@ void md_do_sync(struct md_thread *thread) mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync >= MD_RESYNC_ACTIVE) { + mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { @@ -9259,7 +9272,7 @@ void md_check_recovery(struct mddev *mddev) wake_up(&mddev->sb_wait); } - if (mddev->suspended) + if (is_md_suspended(mddev)) return; if (mddev->bitmap) diff --git a/drivers/md/md.h b/drivers/md/md.h index 554a9026669a..6335cb86e52e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -315,7 +315,7 @@ struct mddev { unsigned long sb_flags; int suspended; - atomic_t active_io; + struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes * are happening, so run/ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d1c9402389f9..25968b25d0ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -806,9 +806,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = alloc_size; + bvec_set_virt(&req->special_vec, range, alloc_size); req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_STS_OK; diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 871c4f32f443..2d068439b129 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -73,13 +73,6 @@ err: return ret; } -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg) -{ - bv->bv_page = sg_page(sg); - bv->bv_offset = sg->offset; - bv->bv_len = sg->length; -} - static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, unsigned long nr_segs, size_t count, int ki_flags) { @@ -146,7 +139,8 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) memset(&req->f.iocb, 0, sizeof(struct kiocb)); for_each_sg(req->sg, sg, req->sg_cnt, i) { - nvmet_file_init_bvec(&req->f.bvec[bv_cnt], sg); + bvec_set_page(&req->f.bvec[bv_cnt], sg_page(sg), sg->length, + sg->offset); len += req->f.bvec[bv_cnt].bv_len; total_len += req->f.bvec[bv_cnt].bv_len; bv_cnt++; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index cc05c094de22..c5759eb503d0 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -321,9 +321,8 @@ static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); - iov->bv_page = sg_page(sg); - iov->bv_len = sg->length; - iov->bv_offset = sg->offset + sg_offset; + bvec_set_page(iov, sg_page(sg), sg->length, + sg->offset + sg_offset); length -= iov_len; sg = sg_next(sg); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 47dafe6b8a66..277960decc10 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -831,6 +831,19 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); } +static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) +{ + 
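The md conversion above replaces an atomic_t counter plus synchronize_rcu() with a percpu_ref that acts as a suspend gate, keeping the submission fast path cheap. Stripped to its core, the pattern looks like this; the example_* wrappers are hypothetical, and the real code additionally handles REQ_NOWAIT and retries after a failed tryget:

/* IO path: only proceed while the gate is live. */
static bool example_enter_io(struct mddev *mddev)
{
        return percpu_ref_tryget_live(&mddev->active_io);
}

static void example_exit_io(struct mddev *mddev)
{
        percpu_ref_put(&mddev->active_io);      /* release cb wakes sb_wait at zero */
}

/* Suspend: kill the ref, then wait for in-flight IO to drain. */
static void example_suspend(struct mddev *mddev)
{
        percpu_ref_kill(&mddev->active_io);
        wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
}

/* Resume: only valid because the ref was created with PERCPU_REF_ALLOW_REINIT. */
static void example_resume(struct mddev *mddev)
{
        percpu_ref_resurrect(&mddev->active_io);
}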
struct page *page; + + page = mempool_alloc(sd_page_pool, GFP_ATOMIC); + if (!page) + return NULL; + clear_highpage(page); + bvec_set_page(&rq->special_vec, page, data_len, 0); + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + return bvec_virt(&rq->special_vec); +} + static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; @@ -841,19 +854,14 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) unsigned int data_len = 24; char *buf; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + buf = sd_set_special_bvec(rq, data_len); + if (!buf) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; - buf = bvec_virt(&rq->special_vec); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); @@ -876,13 +884,8 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; @@ -908,13 +911,8 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index fd584111da45..ce0e000b74fc 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -281,10 +281,8 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; for_each_sg(sgl, sg, sgl_nents, i) { - aio_cmd->bvecs[i].bv_page = sg_page(sg); - aio_cmd->bvecs[i].bv_len = sg->length; - aio_cmd->bvecs[i].bv_offset = sg->offset; - + bvec_set_page(&aio_cmd->bvecs[i], sg_page(sg), sg->length, + sg->offset); len += sg->length; } @@ -329,10 +327,7 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, } for_each_sg(sgl, sg, sgl_nents, i) { - bvec[i].bv_page = sg_page(sg); - bvec[i].bv_len = sg->length; - bvec[i].bv_offset = sg->offset; - + bvec_set_page(&bvec[i], sg_page(sg), sg->length, sg->offset); len += sg->length; } @@ -465,10 +460,9 @@ fd_execute_write_same(struct se_cmd *cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; for (i = 0; i < nolb; i++) { - bvec[i].bv_page = sg_page(&cmd->t_data_sg[0]); - bvec[i].bv_len = cmd->t_data_sg[0].length; - bvec[i].bv_offset = cmd->t_data_sg[0].offset; - + bvec_set_page(&bvec[i], sg_page(&cmd->t_data_sg[0]), + cmd->t_data_sg[0].length, + cmd->t_data_sg[0].offset); len += se_dev->dev_attrib.block_size; } diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 
33eb941fcf15..a1e27da54481 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1126,9 +1126,8 @@ static int iotlb_translate(const struct vringh *vrh, size = map->size - addr + map->start; pa = map->addr + addr - map->start; pfn = pa >> PAGE_SHIFT; - iov[ret].bv_page = pfn_to_page(pfn); - iov[ret].bv_len = min(len - s, size); - iov[ret].bv_offset = pa & (PAGE_SIZE - 1); + bvec_set_page(&iov[ret], pfn_to_page(pfn), min(len - s, size), + pa & (PAGE_SIZE - 1)); s += size; addr += size; ++ret; diff --git a/fs/afs/write.c b/fs/afs/write.c index 19df10d63323..2d17891b618e 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -992,7 +992,7 @@ int afs_launder_folio(struct folio *folio) { struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; - struct bio_vec bv[1]; + struct bio_vec bv; unsigned long priv; unsigned int f, t; int ret = 0; @@ -1008,10 +1008,8 @@ int afs_launder_folio(struct folio *folio) t = afs_folio_dirty_to(folio, priv); } - bv[0].bv_page = &folio->page; - bv[0].bv_offset = f; - bv[0].bv_len = t - f; - iov_iter_bvec(&iter, ITER_SOURCE, bv, 1, bv[0].bv_len); + bvec_set_folio(&bv, folio, t - f, f); + iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len); trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 764598e1efd9..90b2aa7963bf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -103,14 +103,10 @@ static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, size += bytes; for ( ; bytes; idx++, bvec_idx++) { - struct bio_vec bv = { - .bv_page = pages[idx], - .bv_len = min_t(int, bytes, PAGE_SIZE - start), - .bv_offset = start, - }; - - bvecs[bvec_idx] = bv; - bytes -= bv.bv_len; + int len = min_t(int, bytes, PAGE_SIZE - start); + + bvec_set_page(&bvecs[bvec_idx], pages[idx], len, start); + bytes -= len; start = 0; } } diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b2a04b4e89a5..e6088d96eb04 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -759,8 +759,9 @@ cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) { struct msghdr smb_msg = {}; - struct bio_vec bv = { - .bv_page = page, .bv_len = to_read, .bv_offset = page_offset}; + struct bio_vec bv; + + bvec_set_page(&bv, page, to_read, page_offset); iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read); return cifs_readv_from_socket(server, &smb_msg); } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index f6f3a6b75601..0911327ebfde 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -143,14 +143,12 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -171,16 +169,14 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, struct netfs_cache_resources cres; struct fscache_cookie *cookie = cifs_inode_cookie(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; loff_t 
start = page_offset(page); size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 2a19c7987c5b..95cc4d7dd806 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1054,9 +1054,8 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) for (i = 0; i < cur_npages; i++) { len = rc > PAGE_SIZE ? PAGE_SIZE : rc; - bv[npages + i].bv_page = pages[i]; - bv[npages + i].bv_offset = start; - bv[npages + i].bv_len = len - start; + bvec_set_page(&bv[npages + i], pages[i], len - start, + start); rc -= len; start = 0; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e6bcd2baf446..cb2deac6b2d7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -4598,9 +4598,9 @@ init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size, return -ENOMEM; for (i = 0; i < npages; i++) { - bvec[i].bv_page = pages[i]; - bvec[i].bv_offset = (i == 0) ? cur_off : 0; - bvec[i].bv_len = min_t(unsigned int, PAGE_SIZE, data_size); + bvec_set_page(&bvec[i], pages[i], + min_t(unsigned int, PAGE_SIZE, data_size), + i == 0 ? cur_off : 0); data_size -= bvec[i].bv_len; } diff --git a/fs/coredump.c b/fs/coredump.c index de78bde2991b..0a6873a9c4d0 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,11 +840,7 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr) static int dump_emit_page(struct coredump_params *cprm, struct page *page) { - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = 0, - .bv_len = PAGE_SIZE, - }; + struct bio_vec bvec; struct iov_iter iter; struct file *file = cprm->file; loff_t pos; @@ -860,6 +856,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page) if (dump_interrupted()) return 0; pos = file->f_pos; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); n = __kernel_write_iter(cprm->file, &iter, &pos); if (n != PAGE_SIZE) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e731c00a9fcb..ea5f2976dfab 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -245,14 +245,12 @@ static int fscache_fallback_read_page(struct inode *inode, struct page *page) struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_DEST, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_DEST, &bvec, 1, PAGE_SIZE); ret = fscache_begin_read_operation(&cres, cookie); if (ret < 0) @@ -273,16 +271,14 @@ static int fscache_fallback_write_page(struct inode *inode, struct page *page, struct netfs_cache_resources cres; struct fscache_cookie *cookie = nfs_i_fscache(inode); struct iov_iter iter; - struct bio_vec bvec[1]; + struct bio_vec bvec; loff_t start = page_offset(page); size_t len = PAGE_SIZE; int ret; memset(&cres, 0, sizeof(cres)); - bvec[0].bv_page = page; - bvec[0].bv_offset = 0; - bvec[0].bv_len = PAGE_SIZE; - iov_iter_bvec(&iter, ITER_SOURCE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE); + bvec_set_page(&bvec, page, 
PAGE_SIZE, 0); + iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE); ret = fscache_begin_write_operation(&cres, cookie); if (ret < 0) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 4df560894386..215f6cb3dc41 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -49,10 +49,8 @@ static int orangefs_writepage_locked(struct page *page, /* Should've been handled in orangefs_invalidate_folio. */ WARN_ON(off == len || off + wlen > len); - bv.bv_page = page; - bv.bv_len = wlen; - bv.bv_offset = off % PAGE_SIZE; WARN_ON(wlen == 0); + bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, @@ -102,15 +100,11 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, for (i = 0; i < ow->npages; i++) { set_page_writeback(ow->pages[i]); - ow->bv[i].bv_page = ow->pages[i]; - ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, - ow->off + ow->len) - - max(ow->off, page_offset(ow->pages[i])); - if (i == 0) - ow->bv[i].bv_offset = ow->off - - page_offset(ow->pages[i]); - else - ow->bv[i].bv_offset = 0; + bvec_set_page(&ow->bv[i], ow->pages[i], + min(page_offset(ow->pages[i]) + PAGE_SIZE, + ow->off + ow->len) - + max(ow->off, page_offset(ow->pages[i])), + i == 0 ? ow->off - page_offset(ow->pages[i]) : 0); } iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); @@ -300,9 +294,7 @@ static int orangefs_read_folio(struct file *file, struct folio *folio) orangefs_launder_folio(folio); off = folio_pos(folio); - bv.bv_page = &folio->page; - bv.bv_len = folio_size(folio); - bv.bv_offset = 0; + bvec_set_folio(&bv, folio, folio_size(folio), 0); iov_iter_bvec(&iter, ITER_DEST, &bv, 1, folio_size(folio)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, diff --git a/fs/splice.c b/fs/splice.c index 5969b7a1d353..87d9b19349de 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -675,9 +675,8 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, goto done; } - array[n].bv_page = buf->page; - array[n].bv_len = this_len; - array[n].bv_offset = buf->offset; + bvec_set_page(&array[n], buf->page, this_len, + buf->offset); left -= this_len; n++; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b9637d63e6f0..79aec4ebadb9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -163,6 +163,12 @@ struct gendisk { struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; +#ifdef CONFIG_BLK_CGROUP + DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); + struct blkcg_gq *root_blkg; + struct list_head blkg_list; + struct mutex blkcg_mutex; +#endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_BLK_DEV_INTEGRITY struct kobject integrity_kobj; #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -481,12 +487,6 @@ struct request_queue { struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; -#ifdef CONFIG_BLK_CGROUP - DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); - struct blkcg_gq *root_blkg; - struct list_head blkg_list; - struct mutex blkcg_mutex; -#endif struct queue_limits limits; diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 7939b345ee7f..555aae5448ae 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -34,6 +34,46 @@ struct bio_vec { unsigned int bv_offset; }; +/** + * bvec_set_page - initialize a bvec based off a struct page + * @bv: bvec to initialize + * @page: page the bvec should point to + * @len: length of the bvec + * @offset: offset into the page + */ +static 
inline void bvec_set_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int offset) +{ + bv->bv_page = page; + bv->bv_len = len; + bv->bv_offset = offset; +} + +/** + * bvec_set_folio - initialize a bvec based off a struct folio + * @bv: bvec to initialize + * @folio: folio the bvec should point to + * @len: length of the bvec + * @offset: offset into the folio + */ +static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio, + unsigned int len, unsigned int offset) +{ + bvec_set_page(bv, &folio->page, len, offset); +} + +/** + * bvec_set_virt - initialize a bvec based on a virtual address + * @bv: bvec to initialize + * @vaddr: virtual address to set the bvec to + * @len: length of the bvec + */ +static inline void bvec_set_virt(struct bio_vec *bv, void *vaddr, + unsigned int len) +{ + bvec_set_page(bv, virt_to_page(vaddr), len, offset_in_page(vaddr)); +} + struct bvec_iter { sector_t bi_sector; /* device address in 512 byte sectors */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 853d08f7562b..6f6ce9ca7097 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1436,7 +1436,7 @@ struct task_struct { #endif #ifdef CONFIG_BLK_CGROUP - struct request_queue *throttle_queue; + struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 18de10c68a15..a59fc02de598 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1237,9 +1237,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t vec_len; vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; + bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); off = 0; size -= vec_len; } diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe3541897..d9c97704b7c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1044,7 +1044,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #endif #ifdef CONFIG_BLK_CGROUP - tsk->throttle_queue = NULL; + tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 918a7d12df8f..5743be559415 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -320,8 +320,8 @@ static void blk_trace_free(struct request_queue *q, struct blk_trace *bt) * under 'q->debugfs_dir', thus lookup and remove them. 
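The three helpers above replace the open-coded triple assignment removed throughout this series. The common single-segment pattern, mirroring the fscache and coredump call sites, would look like this (example_one_page_iter is a hypothetical name); bvec_set_virt() additionally does the virt_to_page()/offset_in_page() conversion for kernel-virtual payloads, as in the nvme and virtio-blk hunks:

static void example_one_page_iter(struct iov_iter *iter, struct bio_vec *bv,
                                  struct page *page)
{
        /* One full page at offset 0 becomes the iterator's only segment. */
        bvec_set_page(bv, page, PAGE_SIZE, 0);
        iov_iter_bvec(iter, ITER_SOURCE, bv, 1, PAGE_SIZE);
}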
*/ if (!bt->dir) { - debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir)); - debugfs_remove(debugfs_lookup("msg", q->debugfs_dir)); + debugfs_lookup_and_remove("dropped", q->debugfs_dir); + debugfs_lookup_and_remove("msg", q->debugfs_dir); } else { debugfs_remove(bt->dir); } diff --git a/mm/page_io.c b/mm/page_io.c index 3a5f921b932e..233f6e6eb1c5 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -318,9 +318,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) sio->pages = 0; sio->len = 0; } - sio->bvec[sio->pages].bv_page = page; - sio->bvec[sio->pages].bv_len = thp_size(page); - sio->bvec[sio->pages].bv_offset = 0; + bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0); sio->len += thp_size(page); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) { @@ -432,9 +430,7 @@ static void swap_readpage_fs(struct page *page, sio->pages = 0; sio->len = 0; } - sio->bvec[sio->pages].bv_page = page; - sio->bvec[sio->pages].bv_len = thp_size(page); - sio->bvec[sio->pages].bv_offset = 0; + bvec_set_page(&sio->bvec[sio->pages], page, thp_size(page), 0); sio->len += thp_size(page); sio->pages += 1; if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 908a529bca12..3e0a742fb7bb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3642,7 +3642,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) * We've already scheduled a throttle, avoid taking the global swap * lock. */ - if (current->throttle_queue) + if (current->throttle_disk) return; spin_lock(&swap_avail_lock); diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index d1787d7d33ef..d664cb1593a7 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -40,15 +40,12 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) static int ceph_tcp_recvpage(struct socket *sock, struct page *page, int page_offset, size_t length) { - struct bio_vec bvec = { - .bv_page = page, - .bv_offset = page_offset, - .bv_len = length - }; + struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; int r; BUG_ON(page_offset + length > PAGE_SIZE); + bvec_set_page(&bvec, page, length, page_offset); iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length); r = sock_recvmsg(sock, &msg, msg.msg_flags); if (r == -EAGAIN) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 3009028c4fa2..301a991dc6a6 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -149,10 +149,10 @@ static int do_try_sendpage(struct socket *sock, struct iov_iter *it) while (iov_iter_count(it)) { /* iov_iter_iovec() for ITER_BVEC */ - bv.bv_page = it->bvec->bv_page; - bv.bv_offset = it->bvec->bv_offset + it->iov_offset; - bv.bv_len = min(iov_iter_count(it), - it->bvec->bv_len - it->iov_offset); + bvec_set_page(&bv, it->bvec->bv_page, + min(iov_iter_count(it), + it->bvec->bv_len - it->iov_offset), + it->bvec->bv_offset + it->iov_offset); /* * sendpage cannot properly handle pages with @@ -286,9 +286,8 @@ static void set_out_bvec_zero(struct ceph_connection *con) WARN_ON(iov_iter_count(&con->v2.out_iter)); WARN_ON(!con->v2.out_zero); - con->v2.out_bvec.bv_page = ceph_zero_page; - con->v2.out_bvec.bv_offset = 0; - con->v2.out_bvec.bv_len = min(con->v2.out_zero, (int)PAGE_SIZE); + bvec_set_page(&con->v2.out_bvec, ceph_zero_page, + min(con->v2.out_zero, (int)PAGE_SIZE), 0); con->v2.out_iter_sendpage = true; iov_iter_bvec(&con->v2.out_iter, ITER_SOURCE, &con->v2.out_bvec, 1, 
con->v2.out_bvec.bv_len); @@ -863,10 +862,7 @@ static void get_bvec_at(struct ceph_msg_data_cursor *cursor, /* get a piece of data, cursor isn't advanced */ page = ceph_msg_data_next(cursor, &off, &len); - - bv->bv_page = page; - bv->bv_offset = off; - bv->bv_len = len; + bvec_set_page(bv, page, len, off); } static int calc_sg_cnt(void *buf, int buf_len) @@ -1855,9 +1851,8 @@ static void prepare_read_enc_page(struct ceph_connection *con) con->v2.in_enc_resid); WARN_ON(!con->v2.in_enc_resid); - bv.bv_page = con->v2.in_enc_pages[con->v2.in_enc_i]; - bv.bv_offset = 0; - bv.bv_len = min(con->v2.in_enc_resid, (int)PAGE_SIZE); + bvec_set_page(&bv, con->v2.in_enc_pages[con->v2.in_enc_i], + min(con->v2.in_enc_resid, (int)PAGE_SIZE), 0); set_in_bvec(con, &bv); con->v2.in_enc_i++; @@ -2998,9 +2993,8 @@ static void queue_enc_page(struct ceph_connection *con) con->v2.out_enc_resid); WARN_ON(!con->v2.out_enc_resid); - bv.bv_page = con->v2.out_enc_pages[con->v2.out_enc_i]; - bv.bv_offset = 0; - bv.bv_len = min(con->v2.out_enc_resid, (int)PAGE_SIZE); + bvec_set_page(&bv, con->v2.out_enc_pages[con->v2.out_enc_i], + min(con->v2.out_enc_resid, (int)PAGE_SIZE), 0); set_out_bvec(con, &bv, false); con->v2.out_enc_i++; diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 16dcabb71ebe..4a2e90015ca7 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -493,7 +493,7 @@ static int rxperf_deliver_request(struct rxperf_call *call) static int rxperf_process_call(struct rxperf_call *call) { struct msghdr msg = {}; - struct bio_vec bv[1]; + struct bio_vec bv; struct kvec iov[1]; ssize_t n; size_t reply_len = call->reply_len, len; @@ -503,10 +503,8 @@ static int rxperf_process_call(struct rxperf_call *call) while (reply_len > 0) { len = min_t(size_t, reply_len, PAGE_SIZE); - bv[0].bv_page = ZERO_PAGE(0); - bv[0].bv_offset = 0; - bv[0].bv_len = len; - iov_iter_bvec(&msg.msg_iter, WRITE, bv, 1, len); + bvec_set_page(&bv, ZERO_PAGE(0), len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, len, rxperf_notify_end_reply_tx); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 815baf308236..91252adcae46 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -252,11 +252,8 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) { - bvec[i].bv_page = rqstp->rq_pages[i]; - bvec[i].bv_len = PAGE_SIZE; - bvec[i].bv_offset = 0; - } + for (i = 0, t = 0; t < buflen; i++, t += PAGE_SIZE) + bvec_set_page(&bvec[i], rqstp->rq_pages[i], PAGE_SIZE, 0); rqstp->rq_respages = &rqstp->rq_pages[i]; rqstp->rq_next_page = rqstp->rq_respages + 1; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f7767bf22406..afe7ec02d232 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -150,9 +150,8 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) if (!buf->bvec) return -ENOMEM; for (i = 0; i < n; i++) { - buf->bvec[i].bv_page = buf->pages[i]; - buf->bvec[i].bv_len = PAGE_SIZE; - buf->bvec[i].bv_offset = 0; + bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, + 0); } } return 0;