From b2bf96833c5782befc3e7700f791fde754a47b01 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Feb 2009 08:50:26 +0100
Subject: block: fix bogus gcc warning for uninitialized var usage

Newer gcc throw this warning:

        fs/bio.c: In function ?bio_alloc_bioset?:
        fs/bio.c:305: warning: ?p? may be used uninitialized in this function

since it cannot figure out that 'p' is only ever used if 'bs' is non-NULL.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bio.c b/fs/bio.c
index 72ab251cdb9c..124b95c4d582 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -302,7 +302,7 @@ void bio_init(struct bio *bio)
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
 	struct bio *bio = NULL;
-	void *p;
+	void *uninitialized_var(p);
 
 	if (bs) {
 		p = mempool_alloc(bs->bio_pool, gfp_mask);
-- 
cgit 


From 9e8c0bccdc944bd09361672d47660810c027bcaa Mon Sep 17 00:00:00 2001
From: Márton Németh <nm127@freemail.hu>
Date: Fri, 20 Feb 2009 08:12:51 +0100
Subject: block: add documentation for register_blkdev()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add documentation for register_blkdev() function and for the parameters.

Signed-off-by: Márton Németh <nm127@freemail.hu>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/genhd.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index e1eadcc9546a..a9ec910974c1 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -256,6 +256,22 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 }
 #endif /* CONFIG_PROC_FS */
 
+/**
+ * register_blkdev - register a new block device
+ *
+ * @major: the requested major device number [1..255]. If @major=0, try to
+ *         allocate any unused major number.
+ * @name: the name of the new block device as a zero terminated string
+ *
+ * The @name must be unique within the system.
+ *
+ * The return value depends on the @major input parameter.
+ *  - if a major device number was requested in range [1..255] then the
+ *    function returns zero on success, or a negative error code
+ *  - if any unused major number was requested with @major=0 parameter
+ *    then the return value is the allocated major number in range
+ *    [1..255] or a negative error code otherwise
+ */
 int register_blkdev(unsigned int major, const char *name)
 {
 	struct blk_major_name **n, *p;
-- 
cgit 


From 5e4c91c84b194b26cf592779e451f4b5be777cba Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 23 Feb 2009 08:53:35 +0100
Subject: cciss: shorten 30s timeout on controller reset

If reset_devices is set for kexec, then cciss will delay 30 seconds
since the old 5i controller _may_ need that long to recover. Replace
the long sleep with incremental sleep and tests to reduce the 30 seconds
to worst case for 5i, so that other controllers will proceed quickly.

Reviewed-by: Mike Miller <mike.miller@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index d2cb67b61176..b5a061114630 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -3611,11 +3611,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		schedule_timeout_uninterruptible(30*HZ);
 
 		/* Now try to get the controller to respond to a no-op */
-		for (i=0; i<12; i++) {
+		for (i=0; i<30; i++) {
 			if (cciss_noop(pdev) == 0)
 				break;
-			else
-				printk("cciss: no-op failed%s\n", (i < 11 ? "; re-trying" : ""));
+
+			schedule_timeout_uninterruptible(HZ);
+		}
+		if (i == 30) {
+			printk(KERN_ERR "cciss: controller seems dead\n");
+			return -EBUSY;
 		}
 	}
 
-- 
cgit 


From 1e42807918d17e8c93bf14fbb74be84b141334c1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 23 Feb 2009 09:03:10 +0100
Subject: block: reduce stack footprint of blk_recount_segments()

blk_recalc_rq_segments() requires a request structure passed in, which
we don't have from blk_recount_segments(). So the latter allocates one on
the stack, using > 400 bytes of stack for that. This can cause us to spill
over one page of stack from ext4 at least:

 0)     4560     400   blk_recount_segments+0x43/0x62
 1)     4160      32   bio_phys_segments+0x1c/0x24
 2)     4128      32   blk_rq_bio_prep+0x2a/0xf9
 3)     4096      32   init_request_from_bio+0xf9/0xfe
 4)     4064     112   __make_request+0x33c/0x3f6
 5)     3952     144   generic_make_request+0x2d1/0x321
 6)     3808      64   submit_bio+0xb9/0xc3
 7)     3744      48   submit_bh+0xea/0x10e
 8)     3696     368   ext4_mb_init_cache+0x257/0xa6a [ext4]
 9)     3328     288   ext4_mb_regular_allocator+0x421/0xcd9 [ext4]
10)     3040     160   ext4_mb_new_blocks+0x211/0x4b4 [ext4]
11)     2880     336   ext4_ext_get_blocks+0xb61/0xd45 [ext4]
12)     2544      96   ext4_get_blocks_wrap+0xf2/0x200 [ext4]
13)     2448      80   ext4_da_get_block_write+0x6e/0x16b [ext4]
14)     2368     352   mpage_da_map_blocks+0x7e/0x4b3 [ext4]
15)     2016     352   ext4_da_writepages+0x2ce/0x43c [ext4]
16)     1664      32   do_writepages+0x2d/0x3c
17)     1632     144   __writeback_single_inode+0x162/0x2cd
18)     1488      96   generic_sync_sb_inodes+0x1e3/0x32b
19)     1392      16   sync_sb_inodes+0xe/0x10
20)     1376      48   writeback_inodes+0x69/0xb3
21)     1328     208   balance_dirty_pages_ratelimited_nr+0x187/0x2f9
22)     1120     224   generic_file_buffered_write+0x1d4/0x2c4
23)      896     176   __generic_file_aio_write_nolock+0x35f/0x393
24)      720      80   generic_file_aio_write+0x6c/0xc8
25)      640      80   ext4_file_write+0xa9/0x137 [ext4]
26)      560     320   do_sync_write+0xf0/0x137
27)      240      48   vfs_write+0xb3/0x13c
28)      192      64   sys_write+0x4c/0x74
29)      128     128   system_call_fastpath+0x16/0x1b

Split the segment counting out into a __blk_recalc_rq_segments() helper
to avoid allocating an onstack request just for checking the physical
segment count.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-merge.c      | 94 ++++++++++++++++++++++++++++----------------------
 include/linux/blkdev.h |  2 ++
 2 files changed, 55 insertions(+), 41 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index b92f5b0866b0..a104593e70c3 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -38,72 +38,84 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 	}
 }
 
-void blk_recalc_rq_segments(struct request *rq)
+static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
+					     struct bio *bio,
+					     unsigned int *seg_size_ptr)
 {
-	int nr_phys_segs;
 	unsigned int phys_size;
 	struct bio_vec *bv, *bvprv = NULL;
-	int seg_size;
-	int cluster;
-	struct req_iterator iter;
-	int high, highprv = 1;
-	struct request_queue *q = rq->q;
+	int cluster, i, high, highprv = 1;
+	unsigned int seg_size, nr_phys_segs;
+	struct bio *fbio;
 
-	if (!rq->bio)
-		return;
+	if (!bio)
+		return 0;
 
+	fbio = bio;
 	cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 	seg_size = 0;
 	phys_size = nr_phys_segs = 0;
-	rq_for_each_segment(bv, rq, iter) {
-		/*
-		 * the trick here is making sure that a high page is never
-		 * considered part of another segment, since that might
-		 * change with the bounce page.
-		 */
-		high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
-		if (high || highprv)
-			goto new_segment;
-		if (cluster) {
-			if (seg_size + bv->bv_len > q->max_segment_size)
-				goto new_segment;
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+	for_each_bio(bio) {
+		bio_for_each_segment(bv, bio, i) {
+			/*
+			 * the trick here is making sure that a high page is
+			 * never considered part of another segment, since that
+			 * might change with the bounce page.
+			 */
+			high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+			if (high || highprv)
 				goto new_segment;
+			if (cluster) {
+				if (seg_size + bv->bv_len > q->max_segment_size)
+					goto new_segment;
+				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+					goto new_segment;
+				if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+					goto new_segment;
+
+				seg_size += bv->bv_len;
+				bvprv = bv;
+				continue;
+			}
+new_segment:
+			if (nr_phys_segs == 1 && seg_size >
+			    fbio->bi_seg_front_size)
+				fbio->bi_seg_front_size = seg_size;
 
-			seg_size += bv->bv_len;
+			nr_phys_segs++;
 			bvprv = bv;
-			continue;
+			seg_size = bv->bv_len;
+			highprv = high;
 		}
-new_segment:
-		if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
-			rq->bio->bi_seg_front_size = seg_size;
-
-		nr_phys_segs++;
-		bvprv = bv;
-		seg_size = bv->bv_len;
-		highprv = high;
 	}
 
-	if (nr_phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
+	if (seg_size_ptr)
+		*seg_size_ptr = seg_size;
+
+	return nr_phys_segs;
+}
+
+void blk_recalc_rq_segments(struct request *rq)
+{
+	unsigned int seg_size = 0, phys_segs;
+
+	phys_segs = __blk_recalc_rq_segments(rq->q, rq->bio, &seg_size);
+
+	if (phys_segs == 1 && seg_size > rq->bio->bi_seg_front_size)
 		rq->bio->bi_seg_front_size = seg_size;
 	if (seg_size > rq->biotail->bi_seg_back_size)
 		rq->biotail->bi_seg_back_size = seg_size;
 
-	rq->nr_phys_segments = nr_phys_segs;
+	rq->nr_phys_segments = phys_segs;
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-	struct request rq;
 	struct bio *nxt = bio->bi_next;
-	rq.q = q;
-	rq.bio = rq.biotail = bio;
+
 	bio->bi_next = NULL;
-	blk_recalc_rq_segments(&rq);
+	bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, NULL);
 	bio->bi_next = nxt;
-	bio->bi_phys_segments = rq.nr_phys_segments;
 	bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dcaa0fd84b02..465d6babc847 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -708,6 +708,8 @@ struct req_iterator {
 };
 
 /* This should not be used directly - use rq_for_each_segment */
+#define for_each_bio(_bio)		\
+	for (; _bio; _bio = _bio->bi_next)
 #define __rq_for_each_bio(_bio, rq)	\
 	if ((rq->bio))			\
 		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
-- 
cgit 


From 9e973e64ac6dc504e6447d52193d4fff1a670156 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 24 Feb 2009 08:10:09 +0100
Subject: xen/blkfront: use blk_rq_map_sg to generate ring entries

On occasion, the request will apparently have more segments than we
fit into the ring. Jens says:

> The second problem is that the block layer then appears to create one
> too many segments, but from the dump it has rq->nr_phys_segments ==
> BLKIF_MAX_SEGMENTS_PER_REQUEST. I suspect the latter is due to
> xen-blkfront not handling the merging on its own. It should check that
> the new page doesn't form part of the previous page. The
> rq_for_each_segment() iterates all single bits in the request, not dma
> segments. The "easiest" way to do this is to call blk_rq_map_sg() and
> then iterate the mapped sg list. That will give you what you are
> looking for.

> Here's a test patch, compiles but otherwise untested. I spent more
> time figuring out how to enable XEN than to code it up, so YMMV!
> Probably the sg list wants to be put inside the ring and only
> initialized on allocation, then you can get rid of the sg on stack and
> sg_init_table() loop call in the function. I'll leave that, and the
> testing, to you.

[Moved sg array into info structure, and initialize once. -J]

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/block/xen-blkfront.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 918ef725de41..b6c8ce254359 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -40,6 +40,7 @@
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
 #include <linux/module.h>
+#include <linux/scatterlist.h>
 
 #include <xen/xenbus.h>
 #include <xen/grant_table.h>
@@ -82,6 +83,7 @@ struct blkfront_info
 	enum blkif_state connected;
 	int ring_ref;
 	struct blkif_front_ring ring;
+	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
@@ -204,12 +206,11 @@ static int blkif_queue_request(struct request *req)
 	struct blkfront_info *info = req->rq_disk->private_data;
 	unsigned long buffer_mfn;
 	struct blkif_request *ring_req;
-	struct req_iterator iter;
-	struct bio_vec *bvec;
 	unsigned long id;
 	unsigned int fsect, lsect;
-	int ref;
+	int i, ref;
 	grant_ref_t gref_head;
+	struct scatterlist *sg;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
@@ -238,12 +239,13 @@ static int blkif_queue_request(struct request *req)
 	if (blk_barrier_rq(req))
 		ring_req->operation = BLKIF_OP_WRITE_BARRIER;
 
-	ring_req->nr_segments = 0;
-	rq_for_each_segment(bvec, req, iter) {
-		BUG_ON(ring_req->nr_segments == BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		buffer_mfn = pfn_to_mfn(page_to_pfn(bvec->bv_page));
-		fsect = bvec->bv_offset >> 9;
-		lsect = fsect + (bvec->bv_len >> 9) - 1;
+	ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
+	BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+	for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
+		buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
+		fsect = sg->offset >> 9;
+		lsect = fsect + (sg->length >> 9) - 1;
 		/* install a grant reference. */
 		ref = gnttab_claim_grant_reference(&gref_head);
 		BUG_ON(ref == -ENOSPC);
@@ -254,16 +256,12 @@ static int blkif_queue_request(struct request *req)
 				buffer_mfn,
 				rq_data_dir(req) );
 
-		info->shadow[id].frame[ring_req->nr_segments] =
-				mfn_to_pfn(buffer_mfn);
-
-		ring_req->seg[ring_req->nr_segments] =
+		info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
+		ring_req->seg[i] =
 				(struct blkif_request_segment) {
 					.gref       = ref,
 					.first_sect = fsect,
 					.last_sect  = lsect };
-
-		ring_req->nr_segments++;
 	}
 
 	info->ring.req_prod_pvt++;
@@ -622,6 +620,8 @@ static int setup_blkring(struct xenbus_device *dev,
 	SHARED_RING_INIT(sring);
 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
 
+	sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
 	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 	if (err < 0) {
 		free_page((unsigned long)sring);
-- 
cgit