From 76aaa5101fffaef12b45b4c01ed0d0528f23dedf Mon Sep 17 00:00:00 2001
From: Asias He
Date: Thu, 14 Jun 2012 09:04:07 +0200
Subject: block: Drop dead function blk_abort_queue()

This function was only used by btrfs code in btrfs_abort_devices() (seemingly in a wrong way). That caller was removed in commit d07eb9117050c9ed3f78296ebcc06128b52693be, so let's remove the dead code to avoid any confusion.

Changes in v2: update commit log, btrfs_abort_devices() was removed already.

Cc: Jens Axboe
Cc: linux-kernel@vger.kernel.org
Cc: Chris Mason
Cc: linux-btrfs@vger.kernel.org
Cc: David Sterba
Signed-off-by: Asias He
Signed-off-by: Jens Axboe
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux/blkdev.h')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba43f408baa3..07954b05b86c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -827,7 +827,6 @@ extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); -extern void blk_abort_queue(struct request_queue *); extern void blk_unprep_request(struct request *); /*
-- cgit

From 8a5ecdd42862bf87ceab00bf2a15d7eabf58c02d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 4 Jun 2012 20:40:58 -0700
Subject: block: add q->nr_rqs[] and move q->rq.elvpriv to q->nr_rqs_elvpriv

Add q->nr_rqs[] which currently behaves the same as q->rq.count[] and move q->rq.elvpriv to q->nr_rqs_elvpriv. blk_drain_queue() is updated to use q->nr_rqs[] instead of q->rq.count[]. These counters separate queue-wide request statistics from the request list and allow implementation of per-queue request allocation.

While at it, properly indent fields of struct request_list.
Signed-off-by: Tejun Heo
Acked-by: Vivek Goyal
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 13 +++++++------
 include/linux/blkdev.h | 11 ++++++-----
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 71894e143b91..a2648153691f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -387,7 +387,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (!list_empty(&q->queue_head) && q->request_fn) __blk_run_queue(q); - drain |= q->rq.elvpriv; + drain |= q->nr_rqs_elvpriv; /* * Unfortunately, requests are queued at and tracked from
@@ -397,7 +397,7 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) if (drain_all) { drain |= !list_empty(&q->queue_head); for (i = 0; i < 2; i++) { - drain |= q->rq.count[i]; + drain |= q->nr_rqs[i]; drain |= q->in_flight[i]; drain |= !list_empty(&q->flush_queue[i]); }
@@ -526,7 +526,6 @@ static int blk_init_free_list(struct request_queue *q) rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - rl->elvpriv = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
@@ -791,9 +790,10 @@ static void freed_request(struct request_queue *q, unsigned int flags) struct request_list *rl = &q->rq; int sync = rw_is_sync(flags); + q->nr_rqs[sync]--; rl->count[sync]--; if (flags & REQ_ELVPRIV) - rl->elvpriv--; + q->nr_rqs_elvpriv--; __freed_request(q, sync);
@@ -902,6 +902,7 @@ static struct request *__get_request(struct request_queue *q, int rw_flags, if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) return NULL; + q->nr_rqs[is_sync]++; rl->count[is_sync]++; rl->starved[is_sync] = 0;
@@ -917,7 +918,7 @@ static struct request *__get_request(struct request_queue *q, int rw_flags, */ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { rw_flags |= REQ_ELVPRIV; - rl->elvpriv++; + q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); }
@@ -978,7 +979,7 @@ fail_elvpriv: rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); - rl->elvpriv--; + q->nr_rqs_elvpriv--; spin_unlock_irq(q->queue_lock); goto out;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 07954b05b86c..7e44ed93f84b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -51,11 +51,10 @@ struct request_list { * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC */ - int count[2]; - int starved[2]; - int elvpriv; - mempool_t *rq_pool; - wait_queue_head_t wait[2]; + int count[2]; + int starved[2]; + mempool_t *rq_pool; + wait_queue_head_t wait[2]; }; /*
@@ -282,6 +281,8 @@ struct request_queue { struct list_head queue_head; struct request *last_merge; struct elevator_queue *elevator; + int nr_rqs[2]; /* # allocated [a]sync rqs */ + int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ /* * the queue request freelist, one for reads and one for writes
-- cgit

From 5b788ce3e2acac9bf109743b1281d77347cf2101 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 4 Jun 2012 20:40:59 -0700
Subject: block: prepare for multiple request_lists

Request allocation is about to be made per-blkg meaning that there'll be multiple request lists.

* Make queue full state per request_list. blk_*queue_full() functions are renamed to blk_*rl_full() and take @rl instead of @q.

* Rename blk_init_free_list() to blk_init_rl() and make it take @rl instead of @q. Also add @gfp_mask parameter.
* Add blk_exit_rl() instead of destroying rl directly from blk_release_queue(). * Add request_list->q and make request alloc/free functions - blk_free_request(), [__]freed_request(), __get_request() - take @rl instead of @q. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-core.c | 56 ++++++++++++++++++++++++++++---------------------- block/blk-sysfs.c | 12 +++++------ block/blk.h | 3 +++ include/linux/blkdev.h | 32 ++++++++++++++++------------- 4 files changed, 57 insertions(+), 46 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index a2648153691f..f392a2edf462 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -517,13 +517,13 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static int blk_init_free_list(struct request_queue *q) +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask) { - struct request_list *rl = &q->rq; - if (unlikely(rl->rq_pool)) return 0; + rl->q = q; rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); @@ -531,13 +531,19 @@ static int blk_init_free_list(struct request_queue *q) rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep, - GFP_KERNEL, q->node); + gfp_mask, q->node); if (!rl->rq_pool) return -ENOMEM; return 0; } +void blk_exit_rl(struct request_list *rl) +{ + if (rl->rq_pool) + mempool_destroy(rl->rq_pool); +} + struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { return blk_alloc_queue_node(gfp_mask, -1); @@ -679,7 +685,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_free_list(q)) + if (blk_init_rl(&q->rq, q, GFP_KERNEL)) return NULL; q->request_fn = rfn; @@ -721,15 +727,15 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static inline void blk_free_request(struct request_queue *q, struct request *rq) +static inline void blk_free_request(struct request_list *rl, struct request *rq) { if (rq->cmd_flags & REQ_ELVPRIV) { - elv_put_request(q, rq); + elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); } - mempool_free(rq, q->rq.rq_pool); + mempool_free(rq, rl->rq_pool); } /* @@ -766,9 +772,9 @@ static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) ioc->last_waited = jiffies; } -static void __freed_request(struct request_queue *q, int sync) +static void __freed_request(struct request_list *rl, int sync) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; if (rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); @@ -777,7 +783,7 @@ static void __freed_request(struct request_queue *q, int sync) if (waitqueue_active(&rl->wait[sync])) wake_up(&rl->wait[sync]); - blk_clear_queue_full(q, sync); + blk_clear_rl_full(rl, sync); } } @@ -785,9 +791,9 @@ static void __freed_request(struct request_queue *q, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. 
*/ -static void freed_request(struct request_queue *q, unsigned int flags) +static void freed_request(struct request_list *rl, unsigned int flags) { - struct request_list *rl = &q->rq; + struct request_queue *q = rl->q; int sync = rw_is_sync(flags); q->nr_rqs[sync]--; @@ -795,10 +801,10 @@ static void freed_request(struct request_queue *q, unsigned int flags) if (flags & REQ_ELVPRIV) q->nr_rqs_elvpriv--; - __freed_request(q, sync); + __freed_request(rl, sync); if (unlikely(rl->starved[sync ^ 1])) - __freed_request(q, sync ^ 1); + __freed_request(rl, sync ^ 1); } /* @@ -838,7 +844,7 @@ static struct io_context *rq_ioc(struct bio *bio) /** * __get_request - get a free request - * @q: request_queue to allocate request from + * @rl: request list to allocate from * @rw_flags: RW and SYNC flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask @@ -850,11 +856,11 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns %NULL on failure, with @q->queue_lock held. * Returns !%NULL on success, with @q->queue_lock *not held*. */ -static struct request *__get_request(struct request_queue *q, int rw_flags, +static struct request *__get_request(struct request_list *rl, int rw_flags, struct bio *bio, gfp_t gfp_mask) { + struct request_queue *q = rl->q; struct request *rq; - struct request_list *rl = &q->rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; @@ -876,9 +882,9 @@ static struct request *__get_request(struct request_queue *q, int rw_flags, * This process will be allowed to complete a batch of * requests, others will be blocked. */ - if (!blk_queue_full(q, is_sync)) { + if (!blk_rl_full(rl, is_sync)) { ioc_set_batching(q, ioc); - blk_set_queue_full(q, is_sync); + blk_set_rl_full(rl, is_sync); } else { if (may_queue != ELV_MQUEUE_MUST && !ioc_batching(q, ioc)) { @@ -928,7 +934,7 @@ static struct request *__get_request(struct request_queue *q, int rw_flags, spin_unlock_irq(q->queue_lock); /* allocate and init request */ - rq = mempool_alloc(q->rq.rq_pool, gfp_mask); + rq = mempool_alloc(rl->rq_pool, gfp_mask); if (!rq) goto fail_alloc; @@ -992,7 +998,7 @@ fail_alloc: * queue, but this is pretty rare. 
*/ spin_lock_irq(q->queue_lock); - freed_request(q, rw_flags); + freed_request(rl, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -1029,7 +1035,7 @@ static struct request *get_request(struct request_queue *q, int rw_flags, struct request_list *rl = &q->rq; struct request *rq; retry: - rq = __get_request(q, rw_flags, bio, gfp_mask); + rq = __get_request(&q->rq, rw_flags, bio, gfp_mask); if (rq) return rq; @@ -1229,8 +1235,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); - blk_free_request(q, req); - freed_request(q, flags); + blk_free_request(&q->rq, req); + freed_request(&q->rq, flags); } } EXPORT_SYMBOL_GPL(__blk_put_request); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index aa41b47c22d2..234ce7c082fa 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -66,16 +66,16 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) blk_clear_queue_congested(q, BLK_RW_ASYNC); if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_SYNC); + blk_set_rl_full(rl, BLK_RW_SYNC); } else { - blk_clear_queue_full(q, BLK_RW_SYNC); + blk_clear_rl_full(rl, BLK_RW_SYNC); wake_up(&rl->wait[BLK_RW_SYNC]); } if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_queue_full(q, BLK_RW_ASYNC); + blk_set_rl_full(rl, BLK_RW_ASYNC); } else { - blk_clear_queue_full(q, BLK_RW_ASYNC); + blk_clear_rl_full(rl, BLK_RW_ASYNC); wake_up(&rl->wait[BLK_RW_ASYNC]); } spin_unlock_irq(q->queue_lock); @@ -476,7 +476,6 @@ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - struct request_list *rl = &q->rq; blk_sync_queue(q); @@ -489,8 +488,7 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } - if (rl->rq_pool) - mempool_destroy(rl->rq_pool); + blk_exit_rl(&q->rq); if (q->queue_tags) __blk_queue_free_tags(q); diff --git a/block/blk.h b/block/blk.h index 85f6ae42f7d3..a134231fd22a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -18,6 +18,9 @@ static inline void __blk_get_queue(struct request_queue *q) kobject_get(&q->kobj); } +int blk_init_rl(struct request_list *rl, struct request_queue *q, + gfp_t gfp_mask); +void blk_exit_rl(struct request_list *rl); void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e44ed93f84b..f2385ee7c7b2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -46,7 +46,12 @@ struct blkcg_gq; struct request; typedef void (rq_end_io_fn)(struct request *, int); +#define BLK_RL_SYNCFULL (1U << 0) +#define BLK_RL_ASYNCFULL (1U << 1) + struct request_list { + struct request_queue *q; /* the queue this rl belongs to */ + /* * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC @@ -55,6 +60,7 @@ struct request_list { int starved[2]; mempool_t *rq_pool; wait_queue_head_t wait[2]; + unsigned int flags; }; /* @@ -562,27 +568,25 @@ static inline bool rq_is_sync(struct request *rq) return rw_is_sync(rq->cmd_flags); } -static inline int blk_queue_full(struct request_queue *q, int sync) +static inline bool blk_rl_full(struct request_list *rl, bool sync) { - if (sync) - return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); - return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); + unsigned int flag = sync ? 
BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + return rl->flags & flag; } -static inline void blk_set_queue_full(struct request_queue *q, int sync) +static inline void blk_set_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_set(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags |= flag; } -static inline void blk_clear_queue_full(struct request_queue *q, int sync) +static inline void blk_clear_rl_full(struct request_list *rl, bool sync) { - if (sync) - queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); - else - queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); + unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL; + + rl->flags &= ~flag; }
-- cgit

From a051661ca6d134c18599498b185b667859d4339b Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 26 Jun 2012 15:05:44 -0700
Subject: blkcg: implement per-blkg request allocation

Currently, request_queue has one request_list to allocate requests from regardless of blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights.

This can be easily demonstrated by creating a blkio cgroup w/ very low weight, putting a program which can issue a lot of random direct IOs there, and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes.

This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg making blkcgs completely isolated in terms of request allocation.

* Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit hairier, this enables avoiding most overhead for root blkcg.

* Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with shared request_list but needs to be improved with future changes.

* After this change, each block cgroup gets a full request pool making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical).

v2: queue-sysfs.txt updated and patch description updated as suggested by Vivek.

v3: blk_get_rl() wasn't checking error return from blkg_lookup_create() and may cause an oops on lookup failure. Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer.

v4: Updated to accommodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue.
Signed-off-by: Tejun Heo Acked-by: Vivek Goyal Cc: Wu Fengguang Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 7 +++ block/blk-cgroup.c | 51 ++++++++++++++++-- block/blk-cgroup.h | 102 ++++++++++++++++++++++++++++++++++++ block/blk-core.c | 42 +++++++++++---- block/blk-sysfs.c | 32 ++++++----- include/linux/blkdev.h | 12 +++-- 6 files changed, 216 insertions(+), 30 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index d8147b336c35..6518a55273e7 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -38,6 +38,13 @@ read or write requests. Note that the total allocated number may be twice this amount, since it applies only to reads or writes (not the accumulated sum). +To avoid priority inversion through request starvation, a request +queue maintains a separate request pool per each cgroup when +CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such +per-block-cgroup request pool. IOW, if there are N block cgroups, +each request queue may have upto N request pools, each independently +regulated by nr_requests. + read_ahead_kb (RW) ------------------ Maximum number of kilobytes to read-ahead for filesystems on this block diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 63b31ebae6e2..f3b44a65fc7a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -63,6 +63,7 @@ static void blkg_free(struct blkcg_gq *blkg) kfree(pd); } + blk_exit_rl(&blkg->rl); kfree(blkg); } @@ -90,6 +91,13 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->blkcg = blkcg; blkg->refcnt = 1; + /* root blkg uses @q->root_rl, init rl only for !root blkgs */ + if (blkcg != &blkcg_root) { + if (blk_init_rl(&blkg->rl, q, gfp_mask)) + goto err_free; + blkg->rl.blkg = blkg; + } + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; @@ -99,10 +107,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, /* alloc per-policy data and attach it to blkg */ pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); - if (!pd) { - blkg_free(blkg); - return NULL; - } + if (!pd) + goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; @@ -113,6 +119,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, } return blkg; + +err_free: + blkg_free(blkg); + return NULL; } static struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, @@ -300,6 +310,38 @@ void __blkg_release(struct blkcg_gq *blkg) } EXPORT_SYMBOL_GPL(__blkg_release); +/* + * The next function used by blk_queue_for_each_rl(). It's a bit tricky + * because the root blkg uses @q->root_rl instead of its own rl. + */ +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q) +{ + struct list_head *ent; + struct blkcg_gq *blkg; + + /* + * Determine the current blkg list_head. The first entry is + * root_rl which is off @q->blkg_list and mapped to the head. 
+ */ + if (rl == &q->root_rl) { + ent = &q->blkg_list; + } else { + blkg = container_of(rl, struct blkcg_gq, rl); + ent = &blkg->q_node; + } + + /* walk to the next list_head, skip root blkcg */ + ent = ent->next; + if (ent == &q->root_blkg->q_node) + ent = ent->next; + if (ent == &q->blkg_list) + return NULL; + + blkg = container_of(ent, struct blkcg_gq, q_node); + return &blkg->rl; +} + static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) { @@ -750,6 +792,7 @@ int blkcg_activate_policy(struct request_queue *q, goto out_unlock; } q->root_blkg = blkg; + q->root_rl.blkg = blkg; list_for_each_entry(blkg, &q->blkg_list, q_node) cnt++; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index e74cce1fbac9..24597309e23d 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -17,6 +17,7 @@ #include #include #include +#include /* Max limits for throttle policy */ #define THROTL_IOPS_MAX UINT_MAX @@ -93,6 +94,8 @@ struct blkcg_gq { struct list_head q_node; struct hlist_node blkcg_node; struct blkcg *blkcg; + /* request allocation list for this blkcg-q pair */ + struct request_list rl; /* reference count */ int refcnt; @@ -250,6 +253,95 @@ static inline void blkg_put(struct blkcg_gq *blkg) __blkg_release(blkg); } +/** + * blk_get_rl - get request_list to use + * @q: request_queue of interest + * @bio: bio which will be attached to the allocated request (may be %NULL) + * + * The caller wants to allocate a request from @q to use for @bio. Find + * the request_list to use and obtain a reference on it. Should be called + * under queue_lock. This function is guaranteed to return non-%NULL + * request_list. + */ +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) +{ + struct blkcg *blkcg; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + + /* bypass blkg lookup and use @q->root_rl directly for root */ + if (blkcg == &blkcg_root) + goto root_rl; + + /* + * Try to use blkg->rl. blkg lookup may fail under memory pressure + * or if either the blkcg or queue is going away. Fall back to + * root_rl in such cases. + */ + blkg = blkg_lookup_create(blkcg, q); + if (unlikely(IS_ERR(blkg))) + goto root_rl; + + blkg_get(blkg); + rcu_read_unlock(); + return &blkg->rl; +root_rl: + rcu_read_unlock(); + return &q->root_rl; +} + +/** + * blk_put_rl - put request_list + * @rl: request_list to put + * + * Put the reference acquired by blk_get_rl(). Should be called under + * queue_lock. + */ +static inline void blk_put_rl(struct request_list *rl) +{ + /* root_rl may not have blkg set */ + if (rl->blkg && rl->blkg->blkcg != &blkcg_root) + blkg_put(rl->blkg); +} + +/** + * blk_rq_set_rl - associate a request with a request_list + * @rq: request of interest + * @rl: target request_list + * + * Associate @rq with @rl so that accounting and freeing can know the + * request_list @rq came from. + */ +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) +{ + rq->rl = rl; +} + +/** + * blk_rq_rl - return the request_list a request came from + * @rq: request of interest + * + * Return the request_list @rq is allocated from. + */ +static inline struct request_list *blk_rq_rl(struct request *rq) +{ + return rq->rl; +} + +struct request_list *__blk_queue_next_rl(struct request_list *rl, + struct request_queue *q); +/** + * blk_queue_for_each_rl - iterate through all request_lists of a request_queue + * + * Should be used under queue_lock. 
+ */ +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) + /** * blkg_stat_add - add a value to a blkg_stat * @stat: target blkg_stat @@ -392,6 +484,7 @@ static inline void blkcg_deactivate_policy(struct request_queue *q, static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } + static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, struct blkcg_policy *pol) { return NULL; } static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } @@ -399,5 +492,14 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline struct request_list *blk_get_rl(struct request_queue *q, + struct bio *bio) { return &q->root_rl; } +static inline void blk_put_rl(struct request_list *rl) { } +static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } +static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + #endif /* CONFIG_BLK_CGROUP */ #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index f392a2edf462..dd134d834d58 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all) * left with hung waiters. We need to wake up those waiters. */ if (q->request_fn) { + struct request_list *rl; + spin_lock_irq(q->queue_lock); - for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) - wake_up_all(&q->rq.wait[i]); + + blk_queue_for_each_rl(rl, q) + for (i = 0; i < ARRAY_SIZE(rl->wait); i++) + wake_up_all(&rl->wait[i]); + spin_unlock_irq(q->queue_lock); } } @@ -685,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (!q) return NULL; - if (blk_init_rl(&q->rq, q, GFP_KERNEL)) + if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) return NULL; q->request_fn = rfn; @@ -776,7 +781,12 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - if (rl->count[sync] < queue_congestion_off_threshold(q)) + /* + * bdi isn't aware of blkcg yet. As all async IOs end up root + * blkcg anyway, just use root blkcg state. + */ + if (rl == &q->root_rl && + rl->count[sync] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, sync); if (rl->count[sync] + 1 <= q->nr_requests) { @@ -897,7 +907,12 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - blk_set_queue_congested(q, is_sync); + /* + * bdi isn't aware of blkcg yet. As all async IOs end up + * root blkcg anyway, just use root blkcg state. 
+ */ + if (rl == &q->root_rl) + blk_set_queue_congested(q, is_sync); } /* @@ -939,6 +954,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, goto fail_alloc; blk_rq_init(q, rq); + blk_rq_set_rl(rq, rl); rq->cmd_flags = rw_flags | REQ_ALLOCED; /* init elvpriv */ @@ -1032,15 +1048,19 @@ static struct request *get_request(struct request_queue *q, int rw_flags, { const bool is_sync = rw_is_sync(rw_flags) != 0; DEFINE_WAIT(wait); - struct request_list *rl = &q->rq; + struct request_list *rl; struct request *rq; + + rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(&q->rq, rw_flags, bio, gfp_mask); + rq = __get_request(rl, rw_flags, bio, gfp_mask); if (rq) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) + if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) { + blk_put_rl(rl); return NULL; + } /* wait on @rl and retry */ prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, @@ -1231,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (req->cmd_flags & REQ_ALLOCED) { unsigned int flags = req->cmd_flags; + struct request_list *rl = blk_rq_rl(req); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); - blk_free_request(&q->rq, req); - freed_request(&q->rq, flags); + blk_free_request(rl, req); + freed_request(rl, flags); + blk_put_rl(rl); } } EXPORT_SYMBOL_GPL(__blk_put_request); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 234ce7c082fa..9628b291f960 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -40,7 +40,7 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page) static ssize_t queue_requests_store(struct request_queue *q, const char *page, size_t count) { - struct request_list *rl = &q->rq; + struct request_list *rl; unsigned long nr; int ret; @@ -55,6 +55,9 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* congestion isn't cgroup aware and follows root blkcg for now */ + rl = &q->root_rl; + if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) blk_set_queue_congested(q, BLK_RW_SYNC); else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) @@ -65,19 +68,22 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) blk_clear_queue_congested(q, BLK_RW_ASYNC); - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_SYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_SYNC); + wake_up(&rl->wait[BLK_RW_SYNC]); + } + + if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { + blk_set_rl_full(rl, BLK_RW_ASYNC); + } else { + blk_clear_rl_full(rl, BLK_RW_ASYNC); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } } - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } spin_unlock_irq(q->queue_lock); return ret; } @@ -488,7 +494,7 @@ static void blk_release_queue(struct kobject *kobj) elevator_exit(q->elevator); } - blk_exit_rl(&q->rq); + blk_exit_rl(&q->root_rl); if (q->queue_tags) __blk_queue_free_tags(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 
f2385ee7c7b2..3816ce8a08fc 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -51,7 +51,9 @@ typedef void (rq_end_io_fn)(struct request *, int); struct request_list { struct request_queue *q; /* the queue this rl belongs to */ -+#ifdef CONFIG_BLK_CGROUP + struct blkcg_gq *blkg; /* blkg this request pool belongs to */ +#endif /* * count[], starved[], and wait[] are indexed by * BLK_RW_SYNC/BLK_RW_ASYNC
@@ -143,6 +145,7 @@ struct request { struct hd_struct *part; unsigned long start_time; #ifdef CONFIG_BLK_CGROUP + struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; unsigned long long io_start_time_ns; /* when passed to hardware */ #endif
@@ -291,9 +294,12 @@ struct request_queue { int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ /* - * the queue request freelist, one for reads and one for writes + * If blkcg is not used, @q->root_rl serves all requests. If blkcg + * is used, root blkg allocates from @q->root_rl and all other + * blkgs from their own blkg->rl. Which one to use should be + * determined using bio_request_list(). */ - struct request_list rq; + struct request_list root_rl; request_fn_proc *request_fn; make_request_fn *make_request_fn;
-- cgit

From 9cbb17508808f8a6bdd83354b61e126ac4fa6fed Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 09:08:14 +0200
Subject: blk: centralize non-request unplug handling.

Both md and umem have similar code for getting notified on a blk_finish_plug event. Centralize this code in block/ and allow each driver to provide its distinctive difference.

Signed-off-by: NeilBrown
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 25 ++++++++++++++++++++++
 drivers/block/umem.c | 35 ++++++-------------------------
 drivers/md/md.c | 56 +++++---------------------------------------------
 drivers/md/md.h | 8 +++++++-
 include/linux/blkdev.h | 8 ++++++--
 5 files changed, 49 insertions(+), 83 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index dd134d834d58..177ddcf356e6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2927,6 +2927,31 @@ static void flush_plug_callbacks(struct blk_plug *plug) } } +struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, + int size) +{ + struct blk_plug *plug = current->plug; + struct blk_plug_cb *cb; + + if (!plug) + return NULL; + + list_for_each_entry(cb, &plug->cb_list, list) + if (cb->callback == unplug && cb->data == data) + return cb; + + /* Not currently on the callback list */ + BUG_ON(size < sizeof(*cb)); + cb = kzalloc(size, GFP_ATOMIC); + if (cb) { + cb->data = data; + cb->callback = unplug; + list_add(&cb->list, &plug->cb_list); + } + return cb; +} +EXPORT_SYMBOL(blk_check_plugged); + void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request_queue *q;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 9a72277a31df..6ef3489568e3 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -513,42 +513,19 @@ static void process_page(unsigned long data) } } -struct mm_plug_cb { - struct blk_plug_cb cb; - struct cardinfo *card; -}; - static void mm_unplug(struct blk_plug_cb *cb) { - struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); + struct cardinfo *card = cb->data; - spin_lock_irq(&mmcb->card->lock); - activate(mmcb->card); - spin_unlock_irq(&mmcb->card->lock); - kfree(mmcb); + spin_lock_irq(&card->lock); + activate(card); + spin_unlock_irq(&card->lock); + kfree(cb); } static int mm_check_plugged(struct cardinfo
*card) { - struct blk_plug *plug = current->plug; - struct mm_plug_cb *mmcb; - - if (!plug) - return 0; - - list_for_each_entry(mmcb, &plug->cb_list, cb.list) { - if (mmcb->cb.callback == mm_unplug && mmcb->card == card) - return 1; - } - /* Not currently on the callback list */ - mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); - if (!mmcb) - return 0; - - mmcb->card = card; - mmcb->cb.callback = mm_unplug; - list_add(&mmcb->cb.list, &plug->cb_list); - return 1; + return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } static void mm_make_request(struct request_queue *q, struct bio *bio) diff --git a/drivers/md/md.c b/drivers/md/md.c index 34381172a947..b493fa417387 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -498,59 +498,13 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -/* Support for plugging. - * This mirrors the plugging support in request_queue, but does not - * require having a whole queue or request structures. - * We allocate an md_plug_cb for each md device and each thread it gets - * plugged on. This links tot the private plug_handle structure in the - * personality data where we keep a count of the number of outstanding - * plugs so other code can see if a plug is active. - */ -struct md_plug_cb { - struct blk_plug_cb cb; - struct mddev *mddev; -}; - -static void plugger_unplug(struct blk_plug_cb *cb) +void md_unplug(struct blk_plug_cb *cb) { - struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb); - md_wakeup_thread(mdcb->mddev->thread); - kfree(mdcb); -} - -/* Check that an unplug wakeup will come shortly. - * If not, wakeup the md thread immediately - */ -int mddev_check_plugged(struct mddev *mddev) -{ - struct blk_plug *plug = current->plug; - struct md_plug_cb *mdcb; - - if (!plug) - return 0; - - list_for_each_entry(mdcb, &plug->cb_list, cb.list) { - if (mdcb->cb.callback == plugger_unplug && - mdcb->mddev == mddev) { - /* Already on the list, move to top */ - if (mdcb != list_first_entry(&plug->cb_list, - struct md_plug_cb, - cb.list)) - list_move(&mdcb->cb.list, &plug->cb_list); - return 1; - } - } - /* Not currently on the callback list */ - mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC); - if (!mdcb) - return 0; - - mdcb->mddev = mddev; - mdcb->cb.callback = plugger_unplug; - list_add(&mdcb->cb.list, &plug->cb_list); - return 1; + struct mddev *mddev = cb->data; + md_wakeup_thread(mddev->thread); + kfree(cb); } -EXPORT_SYMBOL_GPL(mddev_check_plugged); +EXPORT_SYMBOL(md_unplug); static inline struct mddev *mddev_get(struct mddev *mddev) { diff --git a/drivers/md/md.h b/drivers/md/md.h index 91786c46b85c..8f998e08fb87 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -627,6 +627,12 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); -extern int mddev_check_plugged(struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); + +extern void md_unplug(struct blk_plug_cb *cb); +static inline int mddev_check_plugged(struct mddev *mddev) +{ + return !!blk_check_plugged(md_unplug, mddev, + sizeof(struct blk_plug_cb)); +} #endif /* _MD_MD_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3816ce8a08fc..607ca228f47e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -922,11 +922,15 @@ struct blk_plug { }; #define BLK_MAX_REQUEST_COUNT 16 +struct blk_plug_cb; +typedef void (*blk_plug_cb_fn)(struct 
blk_plug_cb *); struct blk_plug_cb { struct list_head list; - void (*callback)(struct blk_plug_cb *); + blk_plug_cb_fn callback; + void *data; }; - +extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, + void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_finish_plug(struct blk_plug *); extern void blk_flush_plug_list(struct blk_plug *, bool);
-- cgit

From 74018dc3063a2c729fc73041c0a9f03aac995920 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 09:08:15 +0200
Subject: blk: pass from_schedule to non-request unplug functions.

This will allow md/raid to know why the unplug was called, and it will be able to act accordingly - if !from_schedule it is safe to perform tasks which could themselves schedule.

Signed-off-by: NeilBrown
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 6 +++---
 drivers/block/umem.c | 2 +-
 drivers/md/md.c | 2 +-
 drivers/md/md.h | 2 +-
 include/linux/blkdev.h | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux/blkdev.h')

diff --git a/block/blk-core.c b/block/blk-core.c
index 35bf4fe8234c..4b4dbdfbca89 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2909,7 +2909,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, } -static void flush_plug_callbacks(struct blk_plug *plug) +static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks);
@@ -2921,7 +2921,7 @@ static void flush_plug_callbacks(struct blk_plug *plug) struct blk_plug_cb, list); list_del(&cb->list); - cb->callback(cb); + cb->callback(cb, from_schedule); } } }
@@ -2961,7 +2961,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) BUG_ON(plug->magic != PLUG_MAGIC); - flush_plug_callbacks(plug); + flush_plug_callbacks(plug, from_schedule); if (list_empty(&plug->list)) return;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 6ef3489568e3..eb0d8216f557 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -513,7 +513,7 @@ static void process_page(unsigned long data) } } -static void mm_unplug(struct blk_plug_cb *cb) +static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct cardinfo *card = cb->data;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index b493fa417387..db02d2efb76f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -498,7 +498,7 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) } EXPORT_SYMBOL(md_flush_request); -void md_unplug(struct blk_plug_cb *cb) +void md_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct mddev *mddev = cb->data; md_wakeup_thread(mddev->thread);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8f998e08fb87..f385b038589d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -629,7 +629,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_trim_bio(struct bio *bio, int offset, int size); -extern void md_unplug(struct blk_plug_cb *cb); +extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); static inline int mddev_check_plugged(struct mddev *mddev) { return !!blk_check_plugged(md_unplug, mddev,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 607ca228f47e..4e72a9d48232 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -923,7 +923,7 @@ struct blk_plug { #define BLK_MAX_REQUEST_COUNT 16 struct blk_plug_cb; -typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *); +typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb {
struct list_head list; blk_plug_cb_fn callback; -- cgit
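
As a usage illustration of the interface added by the last two commits: a driver that wants to batch work while a blk_plug is active registers a callback with blk_check_plugged() and performs the deferred work in the unplug callback. Below is a minimal sketch, assuming a driver-private struct mydev and a mydev_kick() helper; both names are illustrative only, while blk_check_plugged(), struct blk_plug_cb, and the (cb, from_schedule) callback signature come from the patches above.

/*
 * Illustrative sketch only -- "mydev" and mydev_kick() are hypothetical
 * driver-private names, not part of the kernel tree.
 */
#include <linux/blkdev.h>
#include <linux/slab.h>

struct mydev;				/* driver-private device state */
void mydev_kick(struct mydev *dev);	/* assumed: submit deferred work */

static void mydev_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct mydev *dev = cb->data;

	/*
	 * from_schedule tells the driver whether the unplug runs from the
	 * scheduler path; when false it is safe to do work that may sleep.
	 */
	mydev_kick(dev);
	kfree(cb);			/* the callback owns the blk_plug_cb */
}

static bool mydev_check_plugged(struct mydev *dev)
{
	/*
	 * Reuses an existing callback for (mydev_unplug, dev) on the current
	 * plug, or allocates and queues a new one; returns false if no plug
	 * is active or the allocation fails.
	 */
	return blk_check_plugged(mydev_unplug, dev,
				 sizeof(struct blk_plug_cb)) != NULL;
}

A caller would typically invoke mydev_check_plugged() from its make_request path and fall back to starting the work immediately when it returns false, which is the pattern the umem and md conversions above follow.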