From 3edf5346e4f2ce2fa0c94651a90a8dda169565ee Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Wed, 31 Mar 2021 07:53:59 -0400 Subject: block: only update parent bi_status when bio fail For multiple split bios, if one of the bio is fail, the whole should return error to application. But we found there is a race between bio_integrity_verify_fn and bio complete, which return io success to application after one of the bio fail. The race as following: split bio(READ) kworker nvme_complete_rq blk_update_request //split error=0 bio_endio bio_integrity_endio queue_work(kintegrityd_wq, &bip->bip_work); bio_integrity_verify_fn bio_endio //split bio __bio_chain_endio if (!parent->bi_status) nvme_irq blk_update_request //parent error=7 req_bio_endio bio->bi_status = 7 //parent bio parent->bi_status = 0 parent->bi_end_io() // return bi_status=0 The bio has been split as two: split and parent. When split bio completed, it depends on kworker to do endio, while bio_integrity_verify_fn have been interrupted by parent bio complete irq handler. Then, parent bio->bi_status which have been set in irq handler will overwrite by kworker. In fact, even without the above race, we also need to conside the concurrency beteen mulitple split bio complete and update the same parent bi_status. Normally, multiple split bios will be issued to the same hctx and complete from the same irq vector. But if we have updated queue map between multiple split bios, these bios may complete on different hw queue and different irq vector. Then the concurrency update parent bi_status may cause the final status error. Suggested-by: Keith Busch Signed-off-by: Yufen Yu Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210331115359.1125679-1-yuyufen@huawei.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio.c b/block/bio.c index 963d1d406b3a..50e579088aca 100644 --- a/block/bio.c +++ b/block/bio.c @@ -277,7 +277,7 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_status) + if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; -- cgit From de3510e52b0a398261271455562458003b8eea62 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 1 Apr 2021 07:52:44 +0900 Subject: null_blk: fix command timeout completion handling Memory backed or zoned null block devices may generate actual request timeout errors due to the submission path being blocked on memory allocation or zone locking. Unlike fake timeouts or injected timeouts, the request submission path will call blk_mq_complete_request() or blk_mq_end_request() for these real timeout errors, causing a double completion and use after free situation as the block layer timeout handler executes blk_mq_rq_timed_out() and __blk_mq_free_request() in blk_mq_check_expired(). This problem often triggers a NULL pointer dereference such as: BUG: kernel NULL pointer dereference, address: 0000000000000050 RIP: 0010:blk_mq_sched_mark_restart_hctx+0x5/0x20 ... Call Trace: dd_finish_request+0x56/0x80 blk_mq_free_request+0x37/0x130 null_handle_cmd+0xbf/0x250 [null_blk] ? null_queue_rq+0x67/0xd0 [null_blk] blk_mq_dispatch_rq_list+0x122/0x850 __blk_mq_do_dispatch_sched+0xbb/0x2c0 __blk_mq_sched_dispatch_requests+0x13d/0x190 blk_mq_sched_dispatch_requests+0x30/0x60 __blk_mq_run_hw_queue+0x49/0x90 process_one_work+0x26c/0x580 worker_thread+0x55/0x3c0 ? process_one_work+0x580/0x580 kthread+0x134/0x150 ? kthread_create_worker_on_cpu+0x70/0x70 ret_from_fork+0x1f/0x30 This problem very often triggers when running the full btrfs xfstests on a memory-backed zoned null block device in a VM with limited amount of memory. Avoid this by executing blk_mq_complete_request() in null_timeout_rq() only for commands that are marked for a fake timeout completion using the fake_timeout boolean in struct null_cmd. For timeout errors injected through debugfs, the timeout handler will execute blk_mq_complete_request()i as before. This is safe as the submission path does not execute complete requests in this case. In null_timeout_rq(), also make sure to set the command error field to BLK_STS_TIMEOUT and to propagate this error through to the request completion. Reported-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210331225244.126426-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 26 +++++++++++++++++++++----- drivers/block/null_blk/null_blk.h | 1 + 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d6c821d48090..51bfd7737552 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1369,10 +1369,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, } if (dev->zoned) - cmd->error = null_process_zoned_cmd(cmd, op, - sector, nr_sectors); + sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors); else - cmd->error = null_process_cmd(cmd, op, sector, nr_sectors); + sts = null_process_cmd(cmd, op, sector, nr_sectors); + + /* Do not overwrite errors (e.g. timeout errors) */ + if (cmd->error == BLK_STS_OK) + cmd->error = sts; out: nullb_complete_cmd(cmd); @@ -1451,8 +1454,20 @@ static bool should_requeue_request(struct request *rq) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + pr_info("rq %p timed out\n", rq); - blk_mq_complete_request(rq); + + /* + * If the device is marked as blocking (i.e. memory backed or zoned + * device), the submission path may be blocked waiting for resources + * and cause real timeouts. For these real timeouts, the submission + * path will complete the request using blk_mq_complete_request(). + * Only fake timeouts need to execute blk_mq_complete_request() here. + */ + cmd->error = BLK_STS_TIMEOUT; + if (cmd->fake_timeout) + blk_mq_complete_request(rq); return BLK_EH_DONE; } @@ -1473,6 +1488,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->rq = bd->rq; cmd->error = BLK_STS_OK; cmd->nq = nq; + cmd->fake_timeout = should_timeout_request(bd->rq); blk_mq_start_request(bd->rq); @@ -1489,7 +1505,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } } - if (should_timeout_request(bd->rq)) + if (cmd->fake_timeout) return BLK_STS_OK; return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 83504f3cc9d6..4876d5adb12d 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -22,6 +22,7 @@ struct nullb_cmd { blk_status_t error; struct nullb_queue *nq; struct hrtimer timer; + bool fake_timeout; }; struct nullb_queue { -- cgit From f8b78caf21d5bc3fcfc40c18898f9d52ed1451a5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 20 Nov 2020 17:10:28 +0000 Subject: block: don't ignore REQ_NOWAIT for direct IO If IOCB_NOWAIT is set on submission, then that needs to get propagated to REQ_NOWAIT on the block side. Otherwise we completely lose this information, and any issuer of IOCB_NOWAIT IO will potentially end up blocking on eg request allocation on the storage side. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index 28d583fcdc2c..09d6f7229db9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); @@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; -- cgit From b9c6cdc37ee1fe5866d3b1c10efb9d03191a76af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 19:17:31 +0200 Subject: block: update a few comments in uapi/linux/blkpg.h The big top of the file comment talk about grand plans that never happened, so remove them to not confuse the readers. Also mark the devname and volname fields as ignored as they were never used by the kernel. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/uapi/linux/blkpg.h | 28 ++-------------------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/include/uapi/linux/blkpg.h b/include/uapi/linux/blkpg.h index ac6474e4f29d..d0a64ee97c6d 100644 --- a/include/uapi/linux/blkpg.h +++ b/include/uapi/linux/blkpg.h @@ -2,29 +2,6 @@ #ifndef _UAPI__LINUX_BLKPG_H #define _UAPI__LINUX_BLKPG_H -/* - * Partition table and disk geometry handling - * - * A single ioctl with lots of subfunctions: - * - * Device number stuff: - * get_whole_disk() (given the device number of a partition, - * find the device number of the encompassing disk) - * get_all_partitions() (given the device number of a disk, return the - * device numbers of all its known partitions) - * - * Partition stuff: - * add_partition() - * delete_partition() - * test_partition_in_use() (also for test_disk_in_use) - * - * Geometry stuff: - * get_geometry() - * set_geometry() - * get_bios_drivedata() - * - * For today, only the partition stuff - aeb, 990515 - */ #include #include @@ -52,9 +29,8 @@ struct blkpg_partition { long long start; /* starting offset in bytes */ long long length; /* length in bytes */ int pno; /* partition number */ - char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2, - to be used in kernel messages */ - char volname[BLKPG_VOLNAMELTH]; /* volume label */ + char devname[BLKPG_DEVNAMELTH]; /* unused / ignored */ + char volname[BLKPG_VOLNAMELTH]; /* unused / ignore */ }; #endif /* _UAPI__LINUX_BLKPG_H */ -- cgit From f06c609645ecd043c79380fac94145926603fb33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Apr 2021 19:17:46 +0200 Subject: block: remove the unused RQF_ALLOCED flag Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - include/linux/blkdev.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 9ebb344e2585..271f6596435b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -302,7 +302,6 @@ static const char *const rqf_name[] = { RQF_NAME(QUIET), RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), - RQF_NAME(ALLOCED), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bc6bc8383b43..158aefae1030 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) -/* request came from our alloc pool */ -#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ -- cgit