From 797476b88bde2a6001f9552f383f147e58c1a330 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 18 Oct 2016 15:40:29 +0900 Subject: block: Add 'zoned' queue limit Add the zoned queue limit to indicate the zoning model of a block device. Defined values are 0 (BLK_ZONED_NONE) for regular block devices, 1 (BLK_ZONED_HA) for host-aware zone block devices and 2 (BLK_ZONED_HM) for host-managed zone block devices. The standards defined drive managed model is not defined here since these block devices do not provide any command for accessing zone information. Drive managed model devices will be reported as BLK_ZONED_NONE. The helper functions blk_queue_zoned_model and bdev_zoned_model return the zoned limit and the functions blk_queue_is_zoned and bdev_is_zoned return a boolean for callers to test if a block device is zoned. The zoned attribute is also exported as a string to applications via sysfs. BLK_ZONED_NONE shows as "none", BLK_ZONED_HA as "host-aware" and BLK_ZONED_HM as "host-managed". Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..f19e16bb43d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -261,6 +261,15 @@ struct blk_queue_tag { #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) +/* + * Zoned block device models (zoned limit). 
+ */ +enum blk_zoned_model { + BLK_ZONED_NONE, /* Regular block device */ + BLK_ZONED_HA, /* Host-aware zoned block device */ + BLK_ZONED_HM, /* Host-managed zoned block device */ +}; + struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; @@ -290,6 +299,7 @@ struct queue_limits { unsigned char cluster; unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; + enum blk_zoned_model zoned; }; struct request_queue { @@ -627,6 +637,23 @@ static inline unsigned int blk_queue_cluster(struct request_queue *q) return q->limits.cluster; } +static inline enum blk_zoned_model +blk_queue_zoned_model(struct request_queue *q) +{ + return q->limits.zoned; +} + +static inline bool blk_queue_is_zoned(struct request_queue *q) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + case BLK_ZONED_HM: + return true; + default: + return false; + } +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1354,6 +1381,26 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zoned_model(q); + + return BLK_ZONED_NONE; +} + +static inline bool bdev_is_zoned(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_is_zoned(q); + + return false; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit From 6a0cb1bc106fc07ce0443303bcdb7f7da5131e5c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 18 Oct 2016 15:40:33 +0900 Subject: block: Implement support for zoned block devices Implement zoned block device zone information reporting and reset. Zone information are reported as struct blk_zone. This implementation does not differentiate between host-aware and host-managed device models and is valid for both. 
Two functions are provided: blkdev_report_zones for discovering the zone configuration of a zoned block device, and blkdev_reset_zones for resetting the write pointer of sequential zones. The helper function blk_queue_zone_size and bdev_zone_size are also provided for, as the name suggest, obtaining the zone size (in 512B sectors) of the zones of the device. Signed-off-by: Hannes Reinecke [Damien: * Removed the zone cache * Implement report zones operation based on earlier proposal by Shaun Tancheff ] Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Shaun Tancheff Tested-by: Shaun Tancheff Signed-off-by: Jens Axboe --- block/Kconfig | 8 ++ block/Makefile | 1 + block/blk-zoned.c | 257 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 31 +++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/blkzoned.h | 103 +++++++++++++++++ 6 files changed, 401 insertions(+) create mode 100644 block/blk-zoned.c create mode 100644 include/uapi/linux/blkzoned.h (limited to 'include/linux/blkdev.h') diff --git a/block/Kconfig b/block/Kconfig index 1d4d624492fc..6b0ad08f0677 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -89,6 +89,14 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_ZONED + bool "Zoned block device support" + ---help--- + Block layer zoned block device support. This option enables + support for ZAC/ZBC host-managed and host-aware zoned block devices. + + Say yes here if you have a ZAC or ZBC storage device. 
+ config BLK_DEV_THROTTLING bool "Block layer bio throttling support" depends on BLK_CGROUP=y diff --git a/block/Makefile b/block/Makefile index 36acdd7545be..934dac73fb37 100644 --- a/block/Makefile +++ b/block/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o diff --git a/block/blk-zoned.c b/block/blk-zoned.c new file mode 100644 index 000000000000..1603573f9605 --- /dev/null +++ b/block/blk-zoned.c @@ -0,0 +1,257 @@ +/* + * Zoned block device handling + * + * Copyright (c) 2015, Hannes Reinecke + * Copyright (c) 2015, SUSE Linux GmbH + * + * Copyright (c) 2016, Damien Le Moal + * Copyright (c) 2016, Western Digital + */ + +#include +#include +#include +#include + +static inline sector_t blk_zone_start(struct request_queue *q, + sector_t sector) +{ + sector_t zone_mask = blk_queue_zone_size(q) - 1; + + return sector & ~zone_mask; +} + +/* + * Check that a zone report belongs to the partition. + * If yes, fix its start sector and write pointer, copy it in the + * zone information array and return true. Return false otherwise. 
+ */ +static bool blkdev_report_zone(struct block_device *bdev, + struct blk_zone *rep, + struct blk_zone *zone) +{ + sector_t offset = get_start_sect(bdev); + + if (rep->start < offset) + return false; + + rep->start -= offset; + if (rep->start + rep->len > bdev->bd_part->nr_sects) + return false; + + if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL) + rep->wp = rep->start + rep->len; + else + rep->wp -= offset; + memcpy(zone, rep, sizeof(struct blk_zone)); + + return true; +} + +/** + * blkdev_report_zones - Get zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @zones: Array of zone structures where to return the zones information + * @nr_zones: Number of zone structures in the zone array + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Get zone information starting from the zone containing @sector. + * The number of zone information reported may be less than the number + * requested by @nr_zones. The number of zones actually reported is + * returned in @nr_zones. + */ +int blkdev_report_zones(struct block_device *bdev, + sector_t sector, + struct blk_zone *zones, + unsigned int *nr_zones, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct blk_zone_report_hdr *hdr; + unsigned int nrz = *nr_zones; + struct page *page; + unsigned int nr_rep; + size_t rep_bytes; + unsigned int nr_pages; + struct bio *bio; + struct bio_vec *bv; + unsigned int i, n, nz; + unsigned int ofst; + void *addr; + int ret = 0; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (!nrz) + return 0; + + if (sector > bdev->bd_part->nr_sects) { + *nr_zones = 0; + return 0; + } + + /* + * The zone report has a header. So make room for it in the + * payload. Also make sure that the report fits in a single BIO + * that will not be split down the stack. 
+ */ + rep_bytes = sizeof(struct blk_zone_report_hdr) + + sizeof(struct blk_zone) * nrz; + rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK; + if (rep_bytes > (queue_max_sectors(q) << 9)) + rep_bytes = queue_max_sectors(q) << 9; + + nr_pages = min_t(unsigned int, BIO_MAX_PAGES, + rep_bytes >> PAGE_SHIFT); + nr_pages = min_t(unsigned int, nr_pages, + queue_max_segments(q)); + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return -ENOMEM; + + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blk_zone_start(q, sector); + bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0); + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(gfp_mask); + if (!page) { + ret = -ENOMEM; + goto out; + } + if (!bio_add_page(bio, page, PAGE_SIZE, 0)) { + __free_page(page); + break; + } + } + + if (i == 0) + ret = -ENOMEM; + else + ret = submit_bio_wait(bio); + if (ret) + goto out; + + /* + * Process the report result: skip the header and go through the + * reported zones to fix up the zone information for + * partitions. At the same time, return the zone information into + * the zone array.
+ */ + n = 0; + nz = 0; + nr_rep = 0; + bio_for_each_segment_all(bv, bio, i) { + + if (!bv->bv_page) + break; + + addr = kmap_atomic(bv->bv_page); + + /* Get header in the first page */ + ofst = 0; + if (!nr_rep) { + hdr = (struct blk_zone_report_hdr *) addr; + nr_rep = hdr->nr_zones; + ofst = sizeof(struct blk_zone_report_hdr); + } + + /* Fixup and report zones */ + while (ofst < bv->bv_len && + n < nr_rep && nz < nrz) { + if (blkdev_report_zone(bdev, addr + ofst, &zones[nz])) + nz++; + ofst += sizeof(struct blk_zone); + n++; + } + + kunmap_atomic(addr); + + if (n >= nr_rep || nz >= nrz) + break; + + } + +out: + bio_for_each_segment_all(bv, bio, i) + __free_page(bv->bv_page); + bio_put(bio); + + if (ret == 0) + *nr_zones = nz; + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); + +/** + * blkdev_reset_zones - Reset zones write pointer + * @bdev: Target block device + * @sector: Start sector of the first zone to reset + * @nr_sectors: Number of sectors, at least the length of one zone + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Reset the write pointer of the zones contained in the range + * @sector..@sector+@nr_sectors. Specifying the entire disk sector range + * is valid, but the specified range should not contain conventional zones. 
+ */ +int blkdev_reset_zones(struct block_device *bdev, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors; + sector_t end_sector = sector + nr_sectors; + struct bio *bio; + int ret; + + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (end_sector > bdev->bd_part->nr_sects) + /* Out of range */ + return -EINVAL; + + /* Check alignment (handle eventual smaller last zone) */ + zone_sectors = blk_queue_zone_size(q); + if (sector & (zone_sectors - 1)) + return -EINVAL; + + if ((nr_sectors & (zone_sectors - 1)) && + end_sector != bdev->bd_part->nr_sects) + return -EINVAL; + + while (sector < end_sector) { + + bio = bio_alloc(gfp_mask, 0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0); + + ret = submit_bio_wait(bio); + bio_put(bio); + + if (ret) + return ret; + + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + + } + + return 0; +} +EXPORT_SYMBOL_GPL(blkdev_reset_zones); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f19e16bb43d1..252043f7cd2c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -302,6 +303,21 @@ struct queue_limits { enum blk_zoned_model zoned; }; +#ifdef CONFIG_BLK_DEV_ZONED + +struct blk_zone_report_hdr { + unsigned int nr_zones; + u8 padding[60]; +}; + +extern int blkdev_report_zones(struct block_device *bdev, + sector_t sector, struct blk_zone *zones, + unsigned int *nr_zones, gfp_t gfp_mask); +extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, + sector_t nr_sectors, gfp_t gfp_mask); + +#endif /* CONFIG_BLK_DEV_ZONED */ + struct request_queue { /* * Together with queue_head for cacheline sharing @@ -654,6 +670,11 @@ static inline bool blk_queue_is_zoned(struct 
request_queue *q) } } +static inline unsigned int blk_queue_zone_size(struct request_queue *q) +{ + return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; +} + /* * We regard a request as sync, if either a read or a sync write */ @@ -1401,6 +1422,16 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline unsigned int bdev_zone_size(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return blk_queue_zone_size(q); + + return 0; +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 6965d0909554..b2166f283da9 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -70,6 +70,7 @@ header-y += bfs_fs.h header-y += binfmts.h header-y += blkpg.h header-y += blktrace_api.h +header-y += blkzoned.h header-y += bpf_common.h header-y += bpf_perf_event.h header-y += bpf.h diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h new file mode 100644 index 000000000000..a3817214b0e0 --- /dev/null +++ b/include/uapi/linux/blkzoned.h @@ -0,0 +1,103 @@ +/* + * Zoned block devices handling. + * + * Copyright (C) 2015 Seagate Technology PLC + * + * Written by: Shaun Tancheff + * + * Modified by: Damien Le Moal + * Copyright (C) 2016 Western Digital + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ +#ifndef _UAPI_BLKZONED_H +#define _UAPI_BLKZONED_H + +#include + +/** + * enum blk_zone_type - Types of zones allowed in a zoned device. + * + * @BLK_ZONE_TYPE_CONVENTIONAL: The zone has no write pointer and can be written + * randomly. Zone reset has no effect on the zone.
+ * @BLK_ZONE_TYPE_SEQWRITE_REQ: The zone must be written sequentially + * @BLK_ZONE_TYPE_SEQWRITE_PREF: The zone can be written non-sequentially + * + * Any other value not defined is reserved and must be considered as invalid. + */ +enum blk_zone_type { + BLK_ZONE_TYPE_CONVENTIONAL = 0x1, + BLK_ZONE_TYPE_SEQWRITE_REQ = 0x2, + BLK_ZONE_TYPE_SEQWRITE_PREF = 0x3, +}; + +/** + * enum blk_zone_cond - Condition [state] of a zone in a zoned device. + * + * @BLK_ZONE_COND_NOT_WP: The zone has no write pointer, it is conventional. + * @BLK_ZONE_COND_EMPTY: The zone is empty. + * @BLK_ZONE_COND_IMP_OPEN: The zone is open, but not explicitly opened. + * @BLK_ZONE_COND_EXP_OPEN: The zone was explicitly opened by an + * OPEN ZONE command. + * @BLK_ZONE_COND_CLOSED: The zone was [explicitly] closed after writing. + * @BLK_ZONE_COND_FULL: The zone is marked as full, possibly by a + * FINISH ZONE command. + * @BLK_ZONE_COND_READONLY: The zone is read-only. + * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * + * The Zone Condition state machine in the ZBC/ZAC standards maps the above + * definitions as: + * - ZC1: Empty | BLK_ZONE_COND_EMPTY + * - ZC2: Implicit Open | BLK_ZONE_COND_IMP_OPEN + * - ZC3: Explicit Open | BLK_ZONE_COND_EXP_OPEN + * - ZC4: Closed | BLK_ZONE_COND_CLOSED + * - ZC5: Full | BLK_ZONE_COND_FULL + * - ZC6: Read Only | BLK_ZONE_COND_READONLY + * - ZC7: Offline | BLK_ZONE_COND_OFFLINE + * + * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should + * be considered invalid. + */ +enum blk_zone_cond { + BLK_ZONE_COND_NOT_WP = 0x0, + BLK_ZONE_COND_EMPTY = 0x1, + BLK_ZONE_COND_IMP_OPEN = 0x2, + BLK_ZONE_COND_EXP_OPEN = 0x3, + BLK_ZONE_COND_CLOSED = 0x4, + BLK_ZONE_COND_READONLY = 0xD, + BLK_ZONE_COND_FULL = 0xE, + BLK_ZONE_COND_OFFLINE = 0xF, +}; + +/** + * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl.
+ * + * @start: Zone start in 512 B sector units + * @len: Zone length in 512 B sector units + * @wp: Zone write pointer location in 512 B sector units + * @type: see enum blk_zone_type for possible values + * @cond: see enum blk_zone_cond for possible values + * @non_seq: Flag indicating that the zone is using non-sequential resources + * (for host-aware zoned block devices only). + * @reset: Flag indicating that a zone reset is recommended. + * @reserved: Padding to 64 B to match the ZBC/ZAC defined zone descriptor size. + * + * start, len and wp use the regular 512 B sector unit, regardless of the + * device logical block size. The overall structure size is 64 B to match the + * ZBC/ZAC defined zone descriptor and allow support for future additional + * zone information. + */ +struct blk_zone { + __u64 start; /* Zone start sector */ + __u64 len; /* Zone length in number of sectors */ + __u64 wp; /* Zone write pointer position */ + __u8 type; /* Zone type */ + __u8 cond; /* Zone condition */ + __u8 non_seq; /* Non-sequential write resources active */ + __u8 reset; /* Reset write pointer recommended */ + __u8 reserved[36]; +}; + +#endif /* _UAPI_BLKZONED_H */ -- cgit From 3ed05a987e0f63b21e634101e0b460d32f3581c3 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Tue, 18 Oct 2016 15:40:35 +0900 Subject: blk-zoned: implement ioctls Adds the new BLKREPORTZONE and BLKRESETZONE ioctls for respectively obtaining the zone configuration of a zoned block device and resetting the write pointer of sequential zones of a zoned block device. The BLKREPORTZONE ioctl maps directly to a single call of the function blkdev_report_zones. The zone information result is passed as an array of struct blk_zone identical to the structure used internally for processing the REQ_OP_ZONE_REPORT operation. The BLKRESETZONE ioctl maps to a single call of the blkdev_reset_zones function. 
Signed-off-by: Shaun Tancheff Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-zoned.c | 93 +++++++++++++++++++++++++++++++++++++++++++ block/ioctl.c | 4 ++ include/linux/blkdev.h | 21 ++++++++++ include/uapi/linux/blkzoned.h | 40 +++++++++++++++++++ include/uapi/linux/fs.h | 4 ++ 5 files changed, 162 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1603573f9605..667f95d86695 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -255,3 +255,96 @@ int blkdev_reset_zones(struct block_device *bdev, return 0; } EXPORT_SYMBOL_GPL(blkdev_reset_zones); + +/** + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_report rep; + struct blk_zone *zones; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + zones = kcalloc(rep.nr_zones, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) + return -ENOMEM; + + ret = blkdev_report_zones(bdev, rep.sector, + zones, &rep.nr_zones, + GFP_KERNEL); + if (ret) + goto out; + + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) { + ret = -EFAULT; + goto out; + } + + if (rep.nr_zones) { + if (copy_to_user(argp + sizeof(struct blk_zone_report), zones, + sizeof(struct blk_zone) * rep.nr_zones)) + ret = -EFAULT; + } + + out: + kfree(zones); + + return ret; +} + +/** + * BLKRESETZONE ioctl processing. + * Called from blkdev_ioctl. 
+ */ +int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); +} diff --git a/block/ioctl.c b/block/ioctl.c index 755119c3c1b9..f856963204f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -519,6 +519,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, BLKDEV_DISCARD_SECURE); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + return blkdev_reset_zones_ioctl(bdev, mode, cmd, arg); case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKRAGET: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 252043f7cd2c..90097dd8b8ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -316,6 +316,27 @@ extern int blkdev_report_zones(struct block_device *bdev, extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); + +#else /* CONFIG_BLK_DEV_ZONED */ + +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + +static inline int blkdev_reset_zones_ioctl(struct 
block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + return -ENOTTY; +} + #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index a3817214b0e0..40d1d7bff537 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -16,6 +16,7 @@ #define _UAPI_BLKZONED_H #include +#include /** * enum blk_zone_type - Types of zones allowed in a zoned device. @@ -100,4 +101,43 @@ struct blk_zone { __u8 reserved[36]; }; +/** + * struct blk_zone_report - BLKREPORTZONE ioctl request/reply + * + * @sector: starting sector of report + * @nr_zones: IN maximum / OUT actual + * @reserved: padding to 16 byte alignment + * @zones: Space to hold @nr_zones @zones entries on reply. + * + * The array of at most @nr_zones must follow this structure in memory. + */ +struct blk_zone_report { + __u64 sector; + __u32 nr_zones; + __u8 reserved[4]; + struct blk_zone zones[0]; +} __packed; + +/** + * struct blk_zone_range - BLKRESETZONE ioctl request + * @sector: starting sector of the first zone to issue reset write pointer + * @nr_sectors: Total number of sectors of 1 or more zones to reset + */ +struct blk_zone_range { + __u64 sector; + __u64 nr_sectors; +}; + +/** + * Zoned block device ioctl's: + * + * @BLKREPORTZONE: Get zone information. Takes a zone report as argument. + * The zone report will start from the zone containing the + * sector specified in the report request structure. + * @BLKRESETZONE: Reset the write pointer of the zones in the specified + * sector range. The sector range must be zone aligned. 
+ */ +#define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) +#define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) + #endif /* _UAPI_BLKZONED_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index acb2b6152ba0..c1d11df07b28 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -225,6 +225,10 @@ struct fsxattr { #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) +/* + * A jump here: 130-131 are reserved for zoned block devices + * (see uapi/linux/blkzoned.h) + */ #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit From 5dc8b362a2374d007bc0db649b7ab6a79dd32bda Mon Sep 17 00:00:00 2001 From: Adam Manzanares Date: Mon, 17 Oct 2016 11:27:28 -0700 Subject: block: Add iocontext priority to request Patch adds an association between iocontext ioprio and the ioprio of a request. This is done to enable request based drivers the ability to act on priority information stored in the request. An example being ATA devices that support command priorities. If the ATA driver discovers that the device supports command priorities and the request has valid priority information indicating the request is high priority, then a high priority command can be sent to the device. This should improve tail latencies for high priority IO on any device that queues requests internally and can make use of the priority information stored in the request. The ioprio of the request is set in blk_rq_set_prio which takes the request and the ioc as arguments. If the ioc is valid in blk_rq_set_prio then the iopriority of the request is set as the iopriority of the ioc. In init_request_from_bio a check is made to see if the ioprio of the bio is valid and if so then the request prio comes from the bio. 
Signed-off-by: Adam Manzananares Reviewed-by: Jens Axboe Signed-off-by: Tejun Heo --- block/blk-core.c | 4 +++- include/linux/blkdev.h | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 14d7c0740dc0..361b1b965d89 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1153,6 +1153,7 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); + blk_rq_set_prio(rq, ioc); req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED); /* init elvpriv */ @@ -1656,7 +1657,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->errors = 0; req->__sector = bio->bi_iter.bi_sector; - req->ioprio = bio_prio(bio); + if (ioprio_valid(bio_prio(bio))) + req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c47c358ba052..9a0ceaa1b7e6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -933,6 +933,20 @@ static inline unsigned int blk_rq_count_bios(struct request *rq) return nr_bios; } +/* + * blk_rq_set_prio - associate a request with prio from ioc + * @rq: request of interest + * @ioc: target iocontext + * + * Associate request prio with ioc prio so request based drivers + * can leverage priority information. + */ +static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc) +{ + if (ioc) + rq->ioprio = ioc->ioprio; +} + /* * Request issue related functions. */ -- cgit From e806402130c9c494e22c73ae9ead4e79d2a5811c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2016 15:12:13 +0200 Subject: block: split out request-only flags into a new namespace A lot of the REQ_* flags are only used on struct requests, and only of use to the block layer and a few drivers that dig into struct request internals.
This patch adds a new req_flags_t rq_flags field to struct request for them, and thus dramatically shrinks the number of common requests. It also removes the unfortunate situation where we have to fit the fields from the same enum into 32 bits for struct bio and 64 bits for struct request. Signed-off-by: Christoph Hellwig Reviewed-by: Shaun Tancheff Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 2 +- block/blk-core.c | 71 ++++++++++++++------------- block/blk-exec.c | 2 +- block/blk-flush.c | 9 ++-- block/blk-map.c | 4 +- block/blk-merge.c | 8 +-- block/blk-mq.c | 19 ++++---- block/blk-tag.c | 6 +-- block/blk.h | 4 +- block/elevator.c | 32 ++++++------ drivers/block/pktcdvd.c | 2 +- drivers/ide/ide-atapi.c | 6 +-- drivers/ide/ide-cd.c | 46 +++++++++--------- drivers/ide/ide-cd.h | 2 +- drivers/ide/ide-cd_ioctl.c | 6 +-- drivers/ide/ide-io.c | 6 +-- drivers/ide/ide-pm.c | 4 +- drivers/md/dm-rq.c | 12 ++--- drivers/memstick/core/ms_block.c | 2 +- drivers/memstick/core/mspro_block.c | 2 +- drivers/mmc/card/block.c | 4 +- drivers/mmc/card/queue.c | 4 +- drivers/nvme/host/pci.c | 4 +- drivers/scsi/device_handler/scsi_dh_alua.c | 8 +-- drivers/scsi/device_handler/scsi_dh_emc.c | 2 +- drivers/scsi/device_handler/scsi_dh_hp_sw.c | 2 +- drivers/scsi/device_handler/scsi_dh_rdac.c | 2 +- drivers/scsi/osd/osd_initiator.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/scsi_error.c | 2 +- drivers/scsi/scsi_lib.c | 75 +++++++++++++++++------------ drivers/scsi/sd.c | 6 +-- drivers/scsi/sd_zbc.c | 2 +- drivers/scsi/st.c | 2 +- drivers/scsi/ufs/ufshcd.c | 6 +-- include/linux/blk_types.h | 39 +-------------- include/linux/blkdev.h | 49 ++++++++++++++++++- include/scsi/scsi_device.h | 4 +- 38 files changed, 242 insertions(+), 218 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 918e1e0d0e78..6acea160298c 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt 
@@ -348,7 +348,7 @@ Drivers can now specify a request prepare function (q->prep_rq_fn) that the block layer would invoke to pre-build device commands for a given request, or perform other preparatory processing for the request. This is routine is called by elv_next_request(), i.e. typically just before servicing a request. -(The prepare function would not be called for requests that have REQ_DONTPREP +(The prepare function would not be called for requests that have RQF_DONTPREP enabled) Aside: diff --git a/block/blk-core.c b/block/blk-core.c index e4eda5d2aa56..fd416651a676 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -145,13 +145,13 @@ static void req_bio_endio(struct request *rq, struct bio *bio, if (error) bio->bi_error = error; - if (unlikely(rq->cmd_flags & REQ_QUIET)) + if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } @@ -899,7 +899,7 @@ EXPORT_SYMBOL(blk_get_queue); static inline void blk_free_request(struct request_list *rl, struct request *rq) { - if (rq->cmd_flags & REQ_ELVPRIV) { + if (rq->rq_flags & RQF_ELVPRIV) { elv_put_request(rl->q, rq); if (rq->elv.icq) put_io_context(rq->elv.icq->ioc); @@ -961,14 +961,14 @@ static void __freed_request(struct request_list *rl, int sync) * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. 
*/ -static void freed_request(struct request_list *rl, int op, unsigned int flags) +static void freed_request(struct request_list *rl, bool sync, + req_flags_t rq_flags) { struct request_queue *q = rl->q; - int sync = rw_is_sync(op, flags); q->nr_rqs[sync]--; rl->count[sync]--; - if (flags & REQ_ELVPRIV) + if (rq_flags & RQF_ELVPRIV) q->nr_rqs_elvpriv--; __freed_request(rl, sync); @@ -1079,6 +1079,7 @@ static struct request *__get_request(struct request_list *rl, int op, struct io_cq *icq = NULL; const bool is_sync = rw_is_sync(op, op_flags) != 0; int may_queue; + req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); @@ -1127,7 +1128,7 @@ static struct request *__get_request(struct request_list *rl, int op, /* * Decide whether the new request will be managed by elevator. If - * so, mark @op_flags and increment elvpriv. Non-zero elvpriv will + * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will * prevent the current elevator from being destroyed until the new * request is freed. This guarantees icq's won't be destroyed and * makes creating new ones safe. @@ -1136,14 +1137,14 @@ static struct request *__get_request(struct request_list *rl, int op, * it will be created after releasing queue_lock. 
*/ if (blk_rq_should_init_elevator(bio) && !blk_queue_bypass(q)) { - op_flags |= REQ_ELVPRIV; + rq_flags |= RQF_ELVPRIV; q->nr_rqs_elvpriv++; if (et->icq_cache && ioc) icq = ioc_lookup_icq(ioc, q); } if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; + rq_flags |= RQF_IO_STAT; spin_unlock_irq(q->queue_lock); /* allocate and init request */ @@ -1153,10 +1154,11 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags | REQ_ALLOCED); + req_set_op_attrs(rq, op, op_flags); + rq->rq_flags = rq_flags; /* init elvpriv */ - if (op_flags & REQ_ELVPRIV) { + if (rq_flags & RQF_ELVPRIV) { if (unlikely(et->icq_cache && !icq)) { if (ioc) icq = ioc_create_icq(ioc, q, gfp_mask); @@ -1195,7 +1197,7 @@ fail_elvpriv: printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", __func__, dev_name(q->backing_dev_info.dev)); - rq->cmd_flags &= ~REQ_ELVPRIV; + rq->rq_flags &= ~RQF_ELVPRIV; rq->elv.icq = NULL; spin_lock_irq(q->queue_lock); @@ -1212,7 +1214,7 @@ fail_alloc: * queue, but this is pretty rare. 
*/ spin_lock_irq(q->queue_lock); - freed_request(rl, op, op_flags); + freed_request(rl, is_sync, rq_flags); /* * in the very unlikely event that allocation failed and no @@ -1347,7 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (rq->cmd_flags & REQ_QUEUED) + if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1409,7 +1411,7 @@ EXPORT_SYMBOL_GPL(part_round_stats); #ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) + if (rq->q->dev && !(rq->rq_flags & RQF_PM) && !--rq->q->nr_pending) pm_runtime_mark_last_busy(rq->q->dev); } #else @@ -1421,6 +1423,8 @@ static inline void blk_pm_put_request(struct request *rq) {} */ void __blk_put_request(struct request_queue *q, struct request *req) { + req_flags_t rq_flags = req->rq_flags; + if (unlikely(!q)) return; @@ -1440,16 +1444,15 @@ void __blk_put_request(struct request_queue *q, struct request *req) * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools */ - if (req->cmd_flags & REQ_ALLOCED) { - unsigned int flags = req->cmd_flags; - int op = req_op(req); + if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); + bool sync = rw_is_sync(req_op(req), req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); blk_free_request(rl, req); - freed_request(rl, op, flags); + freed_request(rl, sync, rq_flags); blk_put_rl(rl); } } @@ -2214,7 +2217,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq) unsigned int bytes = 0; struct bio *bio; - if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* @@ -2257,7 +2260,7 @@ void blk_account_io_done(struct request *req) * normal IO on queueing nor completion. Accounting the * containing request is enough. 
*/ - if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { + if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -2285,7 +2288,7 @@ static struct request *blk_pm_peek_request(struct request_queue *q, struct request *rq) { if (q->dev && (q->rpm_status == RPM_SUSPENDED || - (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM)))) + (q->rpm_status != RPM_ACTIVE && !(rq->rq_flags & RQF_PM)))) return NULL; else return rq; @@ -2361,13 +2364,13 @@ struct request *blk_peek_request(struct request_queue *q) if (!rq) break; - if (!(rq->cmd_flags & REQ_STARTED)) { + if (!(rq->rq_flags & RQF_STARTED)) { /* * This is the first time the device driver * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_activate_rq(q, rq); /* @@ -2375,7 +2378,7 @@ struct request *blk_peek_request(struct request_queue *q) * it, a request that has been delayed should * not be passed by new incoming requests */ - rq->cmd_flags |= REQ_STARTED; + rq->rq_flags |= RQF_STARTED; trace_block_rq_issue(q, rq); } @@ -2384,7 +2387,7 @@ struct request *blk_peek_request(struct request_queue *q) q->boundary_rq = NULL; } - if (rq->cmd_flags & REQ_DONTPREP) + if (rq->rq_flags & RQF_DONTPREP) break; if (q->dma_drain_size && blk_rq_bytes(rq)) { @@ -2407,11 +2410,11 @@ struct request *blk_peek_request(struct request_queue *q) /* * the request may have been (partially) prepped. * we need to keep this request in the front to - * avoid resource deadlock. REQ_STARTED will + * avoid resource deadlock. RQF_STARTED will * prevent other fs requests from passing this one. 
*/ if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->cmd_flags & REQ_DONTPREP)) { + !(rq->rq_flags & RQF_DONTPREP)) { /* * remove the space for the drain we added * so that we don't add it again @@ -2424,7 +2427,7 @@ struct request *blk_peek_request(struct request_queue *q) } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { int err = (ret == BLKPREP_INVALID) ? -EREMOTEIO : -EIO; - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; /* * Mark this request as started so we don't trigger * any debug logic in the end I/O path. @@ -2561,7 +2564,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->errors = 0; if (error && req->cmd_type == REQ_TYPE_FS && - !(req->cmd_flags & REQ_QUIET)) { + !(req->rq_flags & RQF_QUIET)) { char *error_type; switch (error) { @@ -2634,7 +2637,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ - if (req->cmd_flags & REQ_MIXED_MERGE) { + if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } @@ -2687,7 +2690,7 @@ void blk_unprep_request(struct request *req) { struct request_queue *q = req->q; - req->cmd_flags &= ~REQ_DONTPREP; + req->rq_flags &= ~RQF_DONTPREP; if (q->unprep_rq_fn) q->unprep_rq_fn(q, req); } @@ -2698,7 +2701,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { - if (req->cmd_flags & REQ_QUEUED) + if (req->rq_flags & RQF_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); @@ -2708,7 +2711,7 @@ void blk_finish_request(struct request *req, int error) blk_delete_timer(req); - if (req->cmd_flags & REQ_DONTPREP) + if (req->rq_flags & RQF_DONTPREP) blk_unprep_request(req); blk_account_io_done(req); diff --git a/block/blk-exec.c b/block/blk-exec.c index 7ea04325d02f..3ecb00a6cf45 100644 --- a/block/blk-exec.c +++ 
b/block/blk-exec.c @@ -72,7 +72,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; rq->errors = -ENXIO; __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); diff --git a/block/blk-flush.c b/block/blk-flush.c index 6a14b68b9135..3990b9cfbda5 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -56,7 +56,7 @@ * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole - * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each FLUSH/FUA request has only one @@ -127,7 +127,7 @@ static void blk_flush_restore_request(struct request *rq) rq->bio = rq->biotail; /* make @rq a normal request */ - rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } @@ -330,7 +330,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH | REQ_FLUSH_SEQ); + req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; @@ -433,7 +434,7 @@ void blk_insert_flush(struct request *rq) */ memset(&rq->flush, 0, sizeof(rq->flush)); INIT_LIST_HEAD(&rq->flush.list); - rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ if (q->mq_ops) { rq->end_io = mq_flush_data_end_io; diff --git a/block/blk-map.c b/block/blk-map.c index b8657fa8dc9a..2c5ae5fef473 100644 --- a/block/blk-map.c +++ 
b/block/blk-map.c @@ -135,7 +135,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, } while (iov_iter_count(&i)); if (!bio_flagged(bio, BIO_USER_MAPPED)) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; return 0; unmap_rq: @@ -232,7 +232,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); if (do_copy) - rq->cmd_flags |= REQ_COPY_USER; + rq->rq_flags |= RQF_COPY_USER; ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { diff --git a/block/blk-merge.c b/block/blk-merge.c index 2642e5fc8b69..fda6a12fc776 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -456,7 +456,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); - if (unlikely(rq->cmd_flags & REQ_COPY_USER) && + if (unlikely(rq->rq_flags & RQF_COPY_USER) && (blk_rq_bytes(rq) & q->dma_pad_mask)) { unsigned int pad_len = (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; @@ -634,7 +634,7 @@ void blk_rq_set_mixed_merge(struct request *rq) unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; - if (rq->cmd_flags & REQ_MIXED_MERGE) + if (rq->rq_flags & RQF_MIXED_MERGE) return; /* @@ -647,7 +647,7 @@ void blk_rq_set_mixed_merge(struct request *rq) (bio->bi_opf & REQ_FAILFAST_MASK) != ff); bio->bi_opf |= ff; } - rq->cmd_flags |= REQ_MIXED_MERGE; + rq->rq_flags |= RQF_MIXED_MERGE; } static void blk_account_io_merge(struct request *req) @@ -709,7 +709,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, * makes sure that all involved bios have mixable attributes * set properly. 
*/ - if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || (req->cmd_flags & REQ_FAILFAST_MASK) != (next->cmd_flags & REQ_FAILFAST_MASK)) { blk_rq_set_mixed_merge(req); diff --git a/block/blk-mq.c b/block/blk-mq.c index d74a74a9f9ef..b49c6658eb05 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -142,14 +142,13 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, struct request *rq, int op, unsigned int op_flags) { - if (blk_queue_io_stat(q)) - op_flags |= REQ_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; req_set_op_attrs(rq, op, op_flags); + if (blk_queue_io_stat(q)) + rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ rq->cpu = -1; INIT_HLIST_NODE(&rq->hash); @@ -198,7 +197,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) rq = data->hctx->tags->rqs[tag]; if (blk_mq_tag_busy(data->hctx)) { - rq->cmd_flags = REQ_MQ_INFLIGHT; + rq->rq_flags = RQF_MQ_INFLIGHT; atomic_inc(&data->hctx->nr_active); } @@ -298,9 +297,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, const int tag = rq->tag; struct request_queue *q = rq->q; - if (rq->cmd_flags & REQ_MQ_INFLIGHT) + if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); - rq->cmd_flags = 0; + rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, ctx, tag); @@ -489,10 +488,10 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irqrestore(&q->requeue_lock, flags); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->cmd_flags & REQ_SOFTBARRIER)) + if (!(rq->rq_flags & RQF_SOFTBARRIER)) continue; - rq->cmd_flags &= ~REQ_SOFTBARRIER; + rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); blk_mq_insert_request(rq, true, false, false); } @@ -519,11 +518,11 @@ 
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) * We abuse this flag that is otherwise used by the I/O scheduler to * request head insertation from the workqueue. */ - BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); + BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); spin_lock_irqsave(&q->requeue_lock, flags); if (at_head) { - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->requeue_list); } else { list_add_tail(&rq->queuelist, &q->requeue_list); diff --git a/block/blk-tag.c b/block/blk-tag.c index f0344e6939d5..bae1decb6ec3 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -270,7 +270,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq) BUG_ON(tag >= bqt->real_max_depth); list_del_init(&rq->queuelist); - rq->cmd_flags &= ~REQ_QUEUED; + rq->rq_flags &= ~RQF_QUEUED; rq->tag = -1; if (unlikely(bqt->tag_index[tag] == NULL)) @@ -316,7 +316,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) unsigned max_depth; int tag; - if (unlikely((rq->cmd_flags & REQ_QUEUED))) { + if (unlikely((rq->rq_flags & RQF_QUEUED))) { printk(KERN_ERR "%s: request %p for device [%s] already tagged %d", __func__, rq, @@ -371,7 +371,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) */ bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->cmd_flags |= REQ_QUEUED; + rq->rq_flags |= RQF_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; blk_start_request(rq); diff --git a/block/blk.h b/block/blk.h index 74444c49078f..aa132dea598c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -130,7 +130,7 @@ static inline void blk_clear_rq_complete(struct request *rq) /* * Internal elevator interface */ -#define ELV_ON_HASH(rq) ((rq)->cmd_flags & REQ_HASHED) +#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); @@ -247,7 +247,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); static inline int blk_do_io_stat(struct request *rq) { 
return rq->rq_disk && - (rq->cmd_flags & REQ_IO_STAT) && + (rq->rq_flags & RQF_IO_STAT) && (rq->cmd_type == REQ_TYPE_FS); } diff --git a/block/elevator.c b/block/elevator.c index f7d973a56fd7..ac80f89a0842 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -245,7 +245,7 @@ EXPORT_SYMBOL(elevator_exit); static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); - rq->cmd_flags &= ~REQ_HASHED; + rq->rq_flags &= ~RQF_HASHED; } static void elv_rqhash_del(struct request_queue *q, struct request *rq) @@ -260,7 +260,7 @@ static void elv_rqhash_add(struct request_queue *q, struct request *rq) BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); - rq->cmd_flags |= REQ_HASHED; + rq->rq_flags |= RQF_HASHED; } static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) @@ -352,7 +352,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) { sector_t boundary; struct list_head *entry; - int stop_flags; if (q->last_merge == rq) q->last_merge = NULL; @@ -362,7 +361,6 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) q->nr_sorted--; boundary = q->end_sector; - stop_flags = REQ_SOFTBARRIER | REQ_STARTED; list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); @@ -370,7 +368,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; - if (pos->cmd_flags & stop_flags) + if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) break; if (blk_rq_pos(rq) >= boundary) { if (blk_rq_pos(pos) < boundary) @@ -510,7 +508,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - const int next_sorted = next->cmd_flags & REQ_SORTED; + const int next_sorted = next->rq_flags & RQF_SORTED; if (next_sorted && e->type->ops.elevator_merge_req_fn) e->type->ops.elevator_merge_req_fn(q, rq, next); @@ -537,13 +535,13 @@ void 
elv_bio_merged(struct request_queue *q, struct request *rq, #ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { - if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; } static void blk_pm_add_request(struct request_queue *q, struct request *rq) { - if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 && + if (q->dev && !(rq->rq_flags & RQF_PM) && q->nr_pending++ == 0 && (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING)) pm_request_resume(q->dev); } @@ -563,11 +561,11 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (rq->cmd_flags & REQ_SORTED) + if (rq->rq_flags & RQF_SORTED) elv_deactivate_rq(q, rq); } - rq->cmd_flags &= ~REQ_STARTED; + rq->rq_flags &= ~RQF_STARTED; blk_pm_requeue_request(rq); @@ -597,13 +595,13 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) rq->q = q; - if (rq->cmd_flags & REQ_SOFTBARRIER) { + if (rq->rq_flags & RQF_SOFTBARRIER) { /* barriers are scheduling boundary, update end_sector */ if (rq->cmd_type == REQ_TYPE_FS) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } - } else if (!(rq->cmd_flags & REQ_ELVPRIV) && + } else if (!(rq->rq_flags & RQF_ELVPRIV) && (where == ELEVATOR_INSERT_SORT || where == ELEVATOR_INSERT_SORT_MERGE)) where = ELEVATOR_INSERT_BACK; @@ -611,12 +609,12 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) switch (where) { case ELEVATOR_INSERT_REQUEUE: case ELEVATOR_INSERT_FRONT: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; list_add(&rq->queuelist, &q->queue_head); break; case ELEVATOR_INSERT_BACK: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; elv_drain_elevator(q); list_add_tail(&rq->queuelist, &q->queue_head); /* @@ -642,7 +640,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, 
int where) break; case ELEVATOR_INSERT_SORT: BUG_ON(rq->cmd_type != REQ_TYPE_FS); - rq->cmd_flags |= REQ_SORTED; + rq->rq_flags |= RQF_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -659,7 +657,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where) break; case ELEVATOR_INSERT_FLUSH: - rq->cmd_flags |= REQ_SOFTBARRIER; + rq->rq_flags |= RQF_SOFTBARRIER; blk_insert_flush(rq); break; default: @@ -735,7 +733,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if ((rq->cmd_flags & REQ_SORTED) && + if ((rq->rq_flags & RQF_SORTED) && e->type->ops.elevator_completed_req_fn) e->type->ops.elevator_completed_req_fn(q, rq); } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 90fa4ac149db..7cf795e0fc8d 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -721,7 +721,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * rq->timeout = 60*HZ; if (cgc->quiet) - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); if (rq->errors) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 05352f490d60..f90ea221f7f2 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -211,7 +211,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; sense_rq->cmd_type = REQ_TYPE_ATA_SENSE; - sense_rq->cmd_flags |= REQ_PREEMPT; + sense_rq->rq_flags |= RQF_PREEMPT; if (drive->media == ide_tape) sense_rq->cmd[13] = REQ_IDETAPE_PC1; @@ -295,7 +295,7 @@ int ide_cd_expiry(ide_drive_t *drive) wait = ATAPI_WAIT_PC; break; default: - if (!(rq->cmd_flags & REQ_QUIET)) + if (!(rq->rq_flags & RQF_QUIET)) printk(KERN_INFO PFX "cmd 0x%x timed out\n", rq->cmd[0]); wait = 0; @@ -375,7 +375,7 @@ int ide_check_ireason(ide_drive_t *drive, struct request 
*rq, int len, } if (dev_is_idecd(drive) && rq->cmd_type == REQ_TYPE_ATA_PC) - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; return 1; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index bf9a2ad296ed..9cbd217bc0c9 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -98,7 +98,7 @@ static int cdrom_log_sense(ide_drive_t *drive, struct request *rq) struct request_sense *sense = &drive->sense_data; int log = 0; - if (!sense || !rq || (rq->cmd_flags & REQ_QUIET)) + if (!sense || !rq || (rq->rq_flags & RQF_QUIET)) return 0; ide_debug_log(IDE_DBG_SENSE, "sense_key: 0x%x", sense->sense_key); @@ -291,7 +291,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * (probably while trying to recover from a former error). * Just give up. */ - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; return 2; } @@ -311,7 +311,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) cdrom_saw_media_change(drive); if (rq->cmd_type == REQ_TYPE_FS && - !(rq->cmd_flags & REQ_QUIET)) + !(rq->rq_flags & RQF_QUIET)) printk(KERN_ERR PFX "%s: tray open\n", drive->name); } @@ -346,7 +346,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * No point in retrying after an illegal request or data * protect error. */ - if (!(rq->cmd_flags & REQ_QUIET)) + if (!(rq->rq_flags & RQF_QUIET)) ide_dump_status(drive, "command error", stat); do_end_request = 1; break; @@ -355,14 +355,14 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) * No point in re-trying a zillion times on a bad sector. * If we got here the error is not correctable. */ - if (!(rq->cmd_flags & REQ_QUIET)) + if (!(rq->rq_flags & RQF_QUIET)) ide_dump_status(drive, "media error " "(bad sector)", stat); do_end_request = 1; break; case BLANK_CHECK: /* disk appears blank? 
*/ - if (!(rq->cmd_flags & REQ_QUIET)) + if (!(rq->rq_flags & RQF_QUIET)) ide_dump_status(drive, "media error (blank)", stat); do_end_request = 1; @@ -380,7 +380,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) } if (rq->cmd_type != REQ_TYPE_FS) { - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; do_end_request = 1; } @@ -422,19 +422,19 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, int write, void *buffer, unsigned *bufflen, struct request_sense *sense, int timeout, - unsigned int cmd_flags) + req_flags_t rq_flags) { struct cdrom_info *info = drive->driver_data; struct request_sense local_sense; int retries = 10; - unsigned int flags = 0; + req_flags_t flags = 0; if (!sense) sense = &local_sense; ide_debug_log(IDE_DBG_PC, "cmd[0]: 0x%x, write: 0x%x, timeout: %d, " - "cmd_flags: 0x%x", - cmd[0], write, timeout, cmd_flags); + "rq_flags: 0x%x", + cmd[0], write, timeout, rq_flags); /* start of retry loop */ do { @@ -446,7 +446,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, memcpy(rq->cmd, cmd, BLK_MAX_CDB); rq->cmd_type = REQ_TYPE_ATA_PC; rq->sense = sense; - rq->cmd_flags |= cmd_flags; + rq->rq_flags |= rq_flags; rq->timeout = timeout; if (buffer) { error = blk_rq_map_kern(drive->queue, rq, buffer, @@ -462,14 +462,14 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, if (buffer) *bufflen = rq->resid_len; - flags = rq->cmd_flags; + flags = rq->rq_flags; blk_put_request(rq); /* * FIXME: we should probably abort/retry or something in case of * failure. */ - if (flags & REQ_FAILED) { + if (flags & RQF_FAILED) { /* * The request failed. Retry if it was due to a unit * attention status (usually means media was changed). 
@@ -494,10 +494,10 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, } /* end of retry loop */ - } while ((flags & REQ_FAILED) && retries >= 0); + } while ((flags & RQF_FAILED) && retries >= 0); /* return an error if the command failed */ - return (flags & REQ_FAILED) ? -EIO : 0; + return (flags & RQF_FAILED) ? -EIO : 0; } /* @@ -589,7 +589,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) "(%u bytes)\n", drive->name, __func__, cmd->nleft); if (!write) - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; uptodate = 0; } } else if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { @@ -607,7 +607,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) } if (!uptodate) - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; } goto out_end; } @@ -745,9 +745,9 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) rq->cmd[0], rq->cmd_type); if (rq->cmd_type == REQ_TYPE_BLOCK_PC) - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; else - rq->cmd_flags &= ~REQ_FAILED; + rq->rq_flags &= ~RQF_FAILED; drive->dma = 0; @@ -867,7 +867,7 @@ int cdrom_check_status(ide_drive_t *drive, struct request_sense *sense) */ cmd[7] = cdi->sanyo_slot % 3; - return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, REQ_QUIET); + return ide_cd_queue_pc(drive, cmd, 0, NULL, NULL, sense, 0, RQF_QUIET); } static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, @@ -890,7 +890,7 @@ static int cdrom_read_capacity(ide_drive_t *drive, unsigned long *capacity, cmd[0] = GPCMD_READ_CDVD_CAPACITY; stat = ide_cd_queue_pc(drive, cmd, 0, &capbuf, &len, sense, 0, - REQ_QUIET); + RQF_QUIET); if (stat) return stat; @@ -943,7 +943,7 @@ static int cdrom_read_tocentry(ide_drive_t *drive, int trackno, int msf_flag, if (msf_flag) cmd[1] = 2; - return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, REQ_QUIET); + return ide_cd_queue_pc(drive, cmd, 0, buf, &buflen, sense, 0, RQF_QUIET); } /* Try to read the entire TOC for 
the disk into our internal buffer. */ diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index 1efc936f5b66..eea60c986c4f 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -101,7 +101,7 @@ void ide_cd_log_error(const char *, struct request *, struct request_sense *); /* ide-cd.c functions used by ide-cd_ioctl.c */ int ide_cd_queue_pc(ide_drive_t *, const unsigned char *, int, void *, - unsigned *, struct request_sense *, int, unsigned int); + unsigned *, struct request_sense *, int, req_flags_t); int ide_cd_read_toc(ide_drive_t *, struct request_sense *); int ide_cdrom_get_capabilities(ide_drive_t *, u8 *); void ide_cdrom_update_speed(ide_drive_t *, u8 *); diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c index 5887a7a09e37..f085e3a2e1d6 100644 --- a/drivers/ide/ide-cd_ioctl.c +++ b/drivers/ide/ide-cd_ioctl.c @@ -305,7 +305,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi) rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_DRV_PRIV; - rq->cmd_flags = REQ_QUIET; + rq->rq_flags = RQF_QUIET; ret = blk_execute_rq(drive->queue, cd->disk, rq, 0); blk_put_request(rq); /* @@ -449,7 +449,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, struct packet_command *cgc) { ide_drive_t *drive = cdi->handle; - unsigned int flags = 0; + req_flags_t flags = 0; unsigned len = cgc->buflen; if (cgc->timeout <= 0) @@ -463,7 +463,7 @@ int ide_cdrom_packet(struct cdrom_device_info *cdi, memset(cgc->sense, 0, sizeof(struct request_sense)); if (cgc->quiet) - flags |= REQ_QUIET; + flags |= RQF_QUIET; cgc->stat = ide_cd_queue_pc(drive, cgc->cmd, cgc->data_direction == CGC_DATA_WRITE, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 669ea1e45795..6360bbd37efe 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -307,7 +307,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) { ide_startstop_t startstop; - BUG_ON(!(rq->cmd_flags & REQ_STARTED)); + 
BUG_ON(!(rq->rq_flags & RQF_STARTED)); #ifdef DEBUG printk("%s: start_request: current=0x%08lx\n", @@ -316,7 +316,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) /* bail early if we've exceeded max_failures */ if (drive->max_failures && (drive->failures > drive->max_failures)) { - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; goto kill_rq; } @@ -539,7 +539,7 @@ repeat: */ if ((drive->dev_flags & IDE_DFLAG_BLOCKED) && ata_pm_request(rq) == 0 && - (rq->cmd_flags & REQ_PREEMPT) == 0) { + (rq->rq_flags & RQF_PREEMPT) == 0) { /* there should be no pending command at this point */ ide_unlock_port(hwif); goto plug_device; diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index e34af488693a..a015acdffb39 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -53,7 +53,7 @@ static int ide_pm_execute_rq(struct request *rq) spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; rq->errors = -ENXIO; __blk_end_request_all(rq, rq->errors); spin_unlock_irq(q->queue_lock); @@ -90,7 +90,7 @@ int generic_ide_resume(struct device *dev) memset(&rqpm, 0, sizeof(rqpm)); rq = blk_get_request(drive->queue, READ, __GFP_RECLAIM); rq->cmd_type = REQ_TYPE_ATA_PM_RESUME; - rq->cmd_flags |= REQ_PREEMPT; + rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index dc75bea0d541..f76cc36b8546 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -313,7 +313,7 @@ static void dm_unprep_request(struct request *rq) if (!rq->q->mq_ops) { rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; + rq->rq_flags &= ~RQF_DONTPREP; } if (clone) @@ -431,7 +431,7 @@ static void dm_softirq_done(struct request *rq) return; } - if (rq->cmd_flags & REQ_FAILED) + if (rq->rq_flags & RQF_FAILED) mapped = false; dm_done(clone, tio->error, mapped); @@ -460,7 +460,7 @@ 
static void dm_complete_request(struct request *rq, int error) */ static void dm_kill_unmapped_request(struct request *rq, int error) { - rq->cmd_flags |= REQ_FAILED; + rq->rq_flags |= RQF_FAILED; dm_complete_request(rq, error); } @@ -476,7 +476,7 @@ static void end_clone_request(struct request *clone, int error) * For just cleaning up the information of the queue in which * the clone was dispatched. * The clone is *NOT* freed actually here because it is alloced - * from dm own mempool (REQ_ALLOCED isn't set). + * from dm own mempool (RQF_ALLOCED isn't set). */ __blk_put_request(clone->q, clone); } @@ -497,7 +497,7 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq) int r; if (blk_queue_io_stat(clone->q)) - clone->cmd_flags |= REQ_IO_STAT; + clone->rq_flags |= RQF_IO_STAT; clone->start_time = jiffies; r = blk_insert_cloned_request(clone->q, clone); @@ -633,7 +633,7 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_DEFER; rq->special = tio; - rq->cmd_flags |= REQ_DONTPREP; + rq->rq_flags |= RQF_DONTPREP; return BLKPREP_OK; } diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index aacf584f2a42..f3512404bc52 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2006,7 +2006,7 @@ static int msb_prepare_req(struct request_queue *q, struct request *req) blk_dump_rq_flags(req, "MS unsupported request"); return BLKPREP_KILL; } - req->cmd_flags |= REQ_DONTPREP; + req->rq_flags |= RQF_DONTPREP; return BLKPREP_OK; } diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c1472275fe57..fa0746d182ff 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -834,7 +834,7 @@ static int mspro_block_prepare_req(struct request_queue *q, struct request *req) return BLKPREP_KILL; } - req->cmd_flags |= REQ_DONTPREP; + req->rq_flags |= RQF_DONTPREP; return BLKPREP_OK; } diff --git 
a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c3335112e68c..f8190dd4a35c 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -2117,7 +2117,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) mmc_blk_abort_packed_req(mq_rq); } else { if (mmc_card_removed(card)) - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; while (ret) ret = blk_end_request(req, -EIO, blk_rq_cur_bytes(req)); @@ -2126,7 +2126,7 @@ static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc) start_new_req: if (rqc) { if (mmc_card_removed(card)) { - rqc->cmd_flags |= REQ_QUIET; + rqc->rq_flags |= RQF_QUIET; blk_end_request_all(rqc, -EIO); } else { /* diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 8037f73a109a..8a67f1c2ce21 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -44,7 +44,7 @@ static int mmc_prep_request(struct request_queue *q, struct request *req) if (mq && (mmc_card_removed(mq->card) || mmc_access_rpmb(mq))) return BLKPREP_KILL; - req->cmd_flags |= REQ_DONTPREP; + req->rq_flags |= RQF_DONTPREP; return BLKPREP_OK; } @@ -120,7 +120,7 @@ static void mmc_request_fn(struct request_queue *q) if (!mq) { while ((req = blk_fetch_request(q)) != NULL) { - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; __blk_end_request_all(req, -EIO); } return; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0fc99f0f2571..0955e9d22020 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -323,9 +323,9 @@ static int nvme_init_iod(struct request *rq, unsigned size, iod->nents = 0; iod->length = size; - if (!(rq->cmd_flags & REQ_DONTPREP)) { + if (!(rq->rq_flags & RQF_DONTPREP)) { rq->retries = 0; - rq->cmd_flags |= REQ_DONTPREP; + rq->rq_flags |= RQF_DONTPREP; } return 0; } diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index 241829e59668..05813a420188 100644 --- 
a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -154,7 +154,8 @@ static int submit_rtpg(struct scsi_device *sdev, unsigned char *buff, return scsi_execute_req_flags(sdev, cdb, DMA_FROM_DEVICE, buff, bufflen, sshdr, ALUA_FAILOVER_TIMEOUT * HZ, - ALUA_FAILOVER_RETRIES, NULL, req_flags); + ALUA_FAILOVER_RETRIES, NULL, + req_flags, 0); } /* @@ -187,7 +188,8 @@ static int submit_stpg(struct scsi_device *sdev, int group_id, return scsi_execute_req_flags(sdev, cdb, DMA_TO_DEVICE, stpg_data, stpg_len, sshdr, ALUA_FAILOVER_TIMEOUT * HZ, - ALUA_FAILOVER_RETRIES, NULL, req_flags); + ALUA_FAILOVER_RETRIES, NULL, + req_flags, 0); } static struct alua_port_group *alua_find_get_pg(char *id_str, size_t id_size, @@ -1063,7 +1065,7 @@ static int alua_prep_fn(struct scsi_device *sdev, struct request *req) state != SCSI_ACCESS_STATE_ACTIVE && state != SCSI_ACCESS_STATE_LBA) { ret = BLKPREP_KILL; - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; } return ret; diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c index 375d81850f15..5b80746980b8 100644 --- a/drivers/scsi/device_handler/scsi_dh_emc.c +++ b/drivers/scsi/device_handler/scsi_dh_emc.c @@ -452,7 +452,7 @@ static int clariion_prep_fn(struct scsi_device *sdev, struct request *req) if (h->lun_state != CLARIION_LUN_OWNED) { ret = BLKPREP_KILL; - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; } return ret; diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c index 9406d5f4a3d3..308e87195dc1 100644 --- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c +++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c @@ -266,7 +266,7 @@ static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req) if (h->path_state != HP_SW_PATH_ACTIVE) { ret = BLKPREP_KILL; - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; } return ret; diff --git 
a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index 06fbd0b0c68a..00d9c326158e 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -724,7 +724,7 @@ static int rdac_prep_fn(struct scsi_device *sdev, struct request *req) if (h->state != RDAC_STATE_ACTIVE) { ret = BLKPREP_KILL; - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; } return ret; diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 2f2a9910e30e..ef99f62831fb 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1595,7 +1595,7 @@ static int _init_blk_request(struct osd_request *or, } or->request = req; - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; req->timeout = or->timeout; req->retries = or->retries; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index 5033223f6287..a2960f5d98ec 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -368,7 +368,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd, return DRIVER_ERROR << 24; blk_rq_set_block_pc(req); - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; SRpnt->bio = NULL; diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index 106a6adbd6f1..996e134d79fa 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev) req->cmd_len = COMMAND_SIZE(req->cmd[0]); - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; req->retries = 5; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 2cca9cffc63f..8c52622ac257 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -163,26 +163,11 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) { __scsi_queue_insert(cmd, reason, 1); } -/** - * scsi_execute - insert request and wait for the result - * @sdev: scsi 
device - * @cmd: scsi command - * @data_direction: data direction - * @buffer: data buffer - * @bufflen: len of buffer - * @sense: optional sense buffer - * @timeout: request timeout in seconds - * @retries: number of times to retry request - * @flags: or into request flags; - * @resid: optional residual length - * - * returns the req->errors value which is the scsi_cmnd result - * field. - */ -int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, + +static int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, unsigned char *sense, int timeout, int retries, u64 flags, - int *resid) + req_flags_t rq_flags, int *resid) { struct request *req; int write = (data_direction == DMA_TO_DEVICE); @@ -203,7 +188,8 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, req->sense_len = 0; req->retries = retries; req->timeout = timeout; - req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT; + req->cmd_flags |= flags; + req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; /* * head injection *required* here otherwise quiesce won't work @@ -227,12 +213,37 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, return ret; } + +/** + * scsi_execute - insert request and wait for the result + * @sdev: scsi device + * @cmd: scsi command + * @data_direction: data direction + * @buffer: data buffer + * @bufflen: len of buffer + * @sense: optional sense buffer + * @timeout: request timeout in seconds + * @retries: number of times to retry request + * @flags: or into request flags; + * @resid: optional residual length + * + * returns the req->errors value which is the scsi_cmnd result + * field. 
+ */ +int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, + int data_direction, void *buffer, unsigned bufflen, + unsigned char *sense, int timeout, int retries, u64 flags, + int *resid) +{ + return __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, sense, + timeout, retries, flags, 0, resid); +} EXPORT_SYMBOL(scsi_execute); int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, int retries, - int *resid, u64 flags) + int *resid, u64 flags, req_flags_t rq_flags) { char *sense = NULL; int result; @@ -242,8 +253,8 @@ int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd, if (!sense) return DRIVER_ERROR << 24; } - result = scsi_execute(sdev, cmd, data_direction, buffer, bufflen, - sense, timeout, retries, flags, resid); + result = __scsi_execute(sdev, cmd, data_direction, buffer, bufflen, + sense, timeout, retries, flags, rq_flags, resid); if (sshdr) scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, sshdr); @@ -813,7 +824,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) */ if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d)) ; - else if (!(req->cmd_flags & REQ_QUIET)) + else if (!(req->rq_flags & RQF_QUIET)) scsi_print_sense(cmd); result = 0; /* BLOCK_PC may have set error */ @@ -943,7 +954,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) switch (action) { case ACTION_FAIL: /* Give up and fail the remainder of the request */ - if (!(req->cmd_flags & REQ_QUIET)) { + if (!(req->rq_flags & RQF_QUIET)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -972,7 +983,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) * A new command will be prepared and issued. 
*/ if (q->mq_ops) { - cmd->request->cmd_flags &= ~REQ_DONTPREP; + cmd->request->rq_flags &= ~RQF_DONTPREP; scsi_mq_uninit_cmd(cmd); scsi_mq_requeue_cmd(cmd); } else { @@ -1234,7 +1245,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) /* * If the devices is blocked we defer normal commands. */ - if (!(req->cmd_flags & REQ_PREEMPT)) + if (!(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_DEFER; break; default: @@ -1243,7 +1254,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) * special commands. In particular any user initiated * command is not allowed. */ - if (!(req->cmd_flags & REQ_PREEMPT)) + if (!(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_KILL; break; } @@ -1279,7 +1290,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret) blk_delay_queue(q, SCSI_QUEUE_DELAY); break; default: - req->cmd_flags |= REQ_DONTPREP; + req->rq_flags |= RQF_DONTPREP; } return ret; @@ -1736,7 +1747,7 @@ static void scsi_request_fn(struct request_queue *q) * we add the dev to the starved list so it eventually gets * a run when a tag is freed. */ - if (blk_queue_tagged(q) && !(req->cmd_flags & REQ_QUEUED)) { + if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) { spin_lock_irq(shost->host_lock); if (list_empty(&sdev->starved_entry)) list_add_tail(&sdev->starved_entry, @@ -1903,11 +1914,11 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_dec_target_busy; - if (!(req->cmd_flags & REQ_DONTPREP)) { + if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret) goto out_dec_host_busy; - req->cmd_flags |= REQ_DONTPREP; + req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); } @@ -1952,7 +1963,7 @@ out: * we hit an error, as we will never see this command * again. 
*/ - if (req->cmd_flags & REQ_DONTPREP) + if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); break; default: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b9618ffca829..cef1f78031d4 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1520,7 +1520,7 @@ static int sd_sync_cache(struct scsi_disk *sdkp) */ res = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0, &sshdr, timeout, SD_MAX_RETRIES, - NULL, REQ_PM); + NULL, 0, RQF_PM); if (res == 0) break; } @@ -1879,7 +1879,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) good_bytes = 0; req->__data_len = blk_rq_bytes(req); - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; } } } @@ -3278,7 +3278,7 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start) return -ENODEV; res = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0, &sshdr, - SD_TIMEOUT, SD_MAX_RETRIES, NULL, REQ_PM); + SD_TIMEOUT, SD_MAX_RETRIES, NULL, 0, RQF_PM); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (driver_byte(res) & DRIVER_SENSE) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index d5b3bd915d9e..394ab490919c 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -348,7 +348,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd, * this case, so be quiet about the error. 
*/ if (req_op(rq) == REQ_OP_ZONE_RESET) - rq->cmd_flags |= REQ_QUIET; + rq->rq_flags |= RQF_QUIET; break; case 0x21: /* diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 7af5226aa55b..3bc46a4abd43 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -546,7 +546,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd, return DRIVER_ERROR << 24; blk_rq_set_block_pc(req); - req->cmd_flags |= REQ_QUIET; + req->rq_flags |= RQF_QUIET; mdata->null_mapped = 1; diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 05c745663c10..cf549871c1ee 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -5590,7 +5590,7 @@ ufshcd_send_request_sense(struct ufs_hba *hba, struct scsi_device *sdp) ret = scsi_execute_req_flags(sdp, cmd, DMA_FROM_DEVICE, buffer, SCSI_SENSE_BUFFERSIZE, NULL, - msecs_to_jiffies(1000), 3, NULL, REQ_PM); + msecs_to_jiffies(1000), 3, NULL, 0, RQF_PM); if (ret) pr_err("%s: failed with err %d\n", __func__, ret); @@ -5652,11 +5652,11 @@ static int ufshcd_set_dev_pwr_mode(struct ufs_hba *hba, /* * Current function would be generally called from the power management - * callbacks hence set the REQ_PM flag so that it doesn't resume the + * callbacks hence set the RQF_PM flag so that it doesn't resume the * already suspended childs. 
*/ ret = scsi_execute_req_flags(sdp, cmd, DMA_NONE, NULL, 0, &sshdr, - START_STOP_TIMEOUT, 0, NULL, REQ_PM); + START_STOP_TIMEOUT, 0, NULL, 0, RQF_PM); if (ret) { sdev_printk(KERN_WARNING, sdp, "START_STOP failed for power mode: %d, result %x\n", diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6df722de2e22..ec69a8fe3b29 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -167,26 +167,6 @@ enum rq_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - /* request only flags */ - __REQ_SORTED, /* elevator knows about this request */ - __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ - __REQ_STARTED, /* drive already may have started this one */ - __REQ_DONTPREP, /* don't call prep for this one */ - __REQ_QUEUED, /* uses queueing */ - __REQ_ELVPRIV, /* elevator private data attached */ - __REQ_FAILED, /* set if the request failed */ - __REQ_QUIET, /* don't worry about errors */ - __REQ_PREEMPT, /* set for "ide_preempt" requests and also - for requests for which the SCSI "quiesce" - state must be ignored. 
*/ - __REQ_ALLOCED, /* request came from our alloc pool */ - __REQ_COPY_USER, /* contains copies of user pages */ - __REQ_FLUSH_SEQ, /* request for flush sequence */ - __REQ_IO_STAT, /* account I/O stat */ - __REQ_MIXED_MERGE, /* merge of different types, fail separately */ - __REQ_PM, /* runtime pm request */ - __REQ_HASHED, /* on IO scheduler merge hash */ - __REQ_MQ_INFLIGHT, /* track inflight for MQ */ __REQ_NR_BITS, /* stops here */ }; @@ -208,29 +188,12 @@ enum rq_flag_bits { /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ - (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_PREFLUSH | REQ_FUA | REQ_FLUSH_SEQ) + (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_SORTED (1ULL << __REQ_SORTED) -#define REQ_SOFTBARRIER (1ULL << __REQ_SOFTBARRIER) #define REQ_FUA (1ULL << __REQ_FUA) #define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_STARTED (1ULL << __REQ_STARTED) -#define REQ_DONTPREP (1ULL << __REQ_DONTPREP) -#define REQ_QUEUED (1ULL << __REQ_QUEUED) -#define REQ_ELVPRIV (1ULL << __REQ_ELVPRIV) -#define REQ_FAILED (1ULL << __REQ_FAILED) -#define REQ_QUIET (1ULL << __REQ_QUIET) -#define REQ_PREEMPT (1ULL << __REQ_PREEMPT) -#define REQ_ALLOCED (1ULL << __REQ_ALLOCED) -#define REQ_COPY_USER (1ULL << __REQ_COPY_USER) #define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) -#define REQ_FLUSH_SEQ (1ULL << __REQ_FLUSH_SEQ) -#define REQ_IO_STAT (1ULL << __REQ_IO_STAT) -#define REQ_MIXED_MERGE (1ULL << __REQ_MIXED_MERGE) -#define REQ_PM (1ULL << __REQ_PM) -#define REQ_HASHED (1ULL << __REQ_HASHED) -#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) enum req_op { REQ_OP_READ, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 90097dd8b8ed..b4415feac679 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -78,6 +78,50 @@ enum rq_cmd_type_bits { REQ_TYPE_DRV_PRIV, /* driver defined types from here */ }; +/* + * request flags */ +typedef __u32 __bitwise 
req_flags_t; + +/* elevator knows about this request */ +#define RQF_SORTED ((__force req_flags_t)(1 << 0)) +/* drive already may have started this one */ +#define RQF_STARTED ((__force req_flags_t)(1 << 1)) +/* uses tagged queueing */ +#define RQF_QUEUED ((__force req_flags_t)(1 << 2)) +/* may not be passed by ioscheduler */ +#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) +/* request for flush sequence */ +#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) +/* merge of different types, fail separately */ +#define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) +/* track inflight for MQ */ +#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) +/* don't call prep for this one */ +#define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* set for "ide_preempt" requests and also for requests for which the SCSI + "quiesce" state must be ignored. */ +#define RQF_PREEMPT ((__force req_flags_t)(1 << 8)) +/* contains copies of user pages */ +#define RQF_COPY_USER ((__force req_flags_t)(1 << 9)) +/* vaguely specified driver internal error. 
Ignored by the block layer */ +#define RQF_FAILED ((__force req_flags_t)(1 << 10)) +/* don't warn about errors */ +#define RQF_QUIET ((__force req_flags_t)(1 << 11)) +/* elevator private data attached */ +#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) +/* account I/O stat */ +#define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) +/* request came from our alloc pool */ +#define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) +/* runtime pm request */ +#define RQF_PM ((__force req_flags_t)(1 << 15)) +/* on IO scheduler merge hash */ +#define RQF_HASHED ((__force req_flags_t)(1 << 16)) + +/* flags that prevent us from merging requests: */ +#define RQF_NOMERGE_FLAGS \ + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) + #define BLK_MAX_CDB 16 /* @@ -99,6 +143,7 @@ struct request { int cpu; unsigned cmd_type; u64 cmd_flags; + req_flags_t rq_flags; unsigned long atomic_flags; /* the following two fields are internal, NEVER access directly */ @@ -648,7 +693,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) REQ_FAILFAST_DRIVER)) #define blk_account_rq(rq) \ - (((rq)->cmd_flags & REQ_STARTED) && \ + (((rq)->rq_flags & RQF_STARTED) && \ ((rq)->cmd_type == REQ_TYPE_FS)) #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) @@ -740,6 +785,8 @@ static inline bool rq_mergeable(struct request *rq) if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; return true; } diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 8a9563144890..8990e580b278 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -414,14 +414,14 @@ extern int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, extern int scsi_execute_req_flags(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, - int retries, int *resid, u64 flags); + int retries, int *resid, u64 flags, req_flags_t 
rq_flags); static inline int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, struct scsi_sense_hdr *sshdr, int timeout, int retries, int *resid) { return scsi_execute_req_flags(sdev, cmd, data_direction, buffer, - bufflen, sshdr, timeout, retries, resid, 0); + bufflen, sshdr, timeout, retries, resid, 0, 0); } extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); -- cgit From ef295ecf090d3e86e5b742fc6ab34f1122a43773 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2016 08:48:16 -0600 Subject: block: better op and flags encoding Now that we don't need the common flags to overflow outside the range of a 32-bit type we can encode them the same way for both the bio and request fields. This in addition allows us to place the operation first (and make some room for more ops while we're at it) and to stop having to shift around the operation values. In addition this allows passing around only one value in the block layer instead of two (and eventually also in the file systems, but we can do that later) and thus clean up a lot of code. Last but not least this allows decreasing the size of the cmd_flags field in struct request to 32-bits. Various functions passing this value could also be updated, but I'd like to avoid the churn for now. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 4 +- block/blk-core.c | 60 ++++++++++-------------------- block/blk-flush.c | 2 +- block/blk-lib.c | 2 +- block/blk-map.c | 2 + block/blk-mq.c | 28 ++++++-------- block/cfq-iosched.c | 66 ++++++++++++++++----------------- block/elevator.c | 4 +- drivers/md/dm-crypt.c | 2 +- drivers/scsi/sd.c | 3 +- fs/btrfs/inode.c | 5 +-- fs/buffer.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/gfs2/lops.c | 2 +- include/linux/blk-cgroup.h | 11 +++--- include/linux/blk_types.h | 83 +++++++++++++++++++----------------------- include/linux/blkdev.h | 26 +------------ include/linux/blktrace_api.h | 2 +- include/linux/dm-io.h | 2 +- include/linux/elevator.h | 4 +- include/trace/events/bcache.h | 12 ++---- include/trace/events/block.h | 31 ++++++---------- kernel/trace/blktrace.c | 14 +++---- 23 files changed, 148 insertions(+), 221 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 6acea160298c..01ddeaf64b0f 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -553,8 +553,8 @@ struct request { struct request_list *rl; } -See the rq_flag_bits definitions for an explanation of the various flags -available. Some bits are used by the block layer or i/o scheduler. +See the req_ops and req_flag_bits definitions for an explanation of the various +flags available. Some bits are used by the block layer or i/o scheduler. 
The behaviour of the various sector counts are almost the same as before, except that since we have multi-segment bios, current_nr_sectors refers diff --git a/block/blk-core.c b/block/blk-core.c index fd416651a676..0bfaa54d3e9f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1056,8 +1056,7 @@ static struct io_context *rq_ioc(struct bio *bio) /** * __get_request - get a free request * @rl: request list to allocate from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1068,23 +1067,22 @@ static struct io_context *rq_ioc(struct bio *bio) * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *__get_request(struct request_list *rl, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *__get_request(struct request_list *rl, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { struct request_queue *q = rl->q; struct request *rq; struct elevator_type *et = q->elevator->type; struct io_context *ioc = rq_ioc(bio); struct io_cq *icq = NULL; - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); int may_queue; req_flags_t rq_flags = RQF_ALLOCED; if (unlikely(blk_queue_dying(q))) return ERR_PTR(-ENODEV); - may_queue = elv_may_queue(q, op, op_flags); + may_queue = elv_may_queue(q, op); if (may_queue == ELV_MQUEUE_NO) goto rq_starved; @@ -1154,7 +1152,7 @@ static struct request *__get_request(struct request_list *rl, int op, blk_rq_init(q, rq); blk_rq_set_rl(rq, rl); - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; rq->rq_flags = rq_flags; /* init elvpriv */ @@ -1232,8 +1230,7 @@ rq_starved: /** * get_request - get a free request * @q: request_queue to allocate request from - * @op: REQ_OP_READ/REQ_OP_WRITE - * @op_flags: rq_flag_bits + * @op: operation and flags * @bio: 
bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * @@ -1244,18 +1241,17 @@ rq_starved: * Returns ERR_PTR on failure, with @q->queue_lock held. * Returns request pointer on success, with @q->queue_lock *not held*. */ -static struct request *get_request(struct request_queue *q, int op, - int op_flags, struct bio *bio, - gfp_t gfp_mask) +static struct request *get_request(struct request_queue *q, unsigned int op, + struct bio *bio, gfp_t gfp_mask) { - const bool is_sync = rw_is_sync(op, op_flags) != 0; + const bool is_sync = op_is_sync(op); DEFINE_WAIT(wait); struct request_list *rl; struct request *rq; rl = blk_get_rl(q, bio); /* transferred to @rq on success */ retry: - rq = __get_request(rl, op, op_flags, bio, gfp_mask); + rq = __get_request(rl, op, bio, gfp_mask); if (!IS_ERR(rq)) return rq; @@ -1297,7 +1293,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw, create_io_context(gfp_mask, q->node); spin_lock_irq(q->queue_lock); - rq = get_request(q, rw, 0, NULL, gfp_mask); + rq = get_request(q, rw, NULL, gfp_mask); if (IS_ERR(rq)) { spin_unlock_irq(q->queue_lock); return rq; @@ -1446,7 +1442,7 @@ void __blk_put_request(struct request_queue *q, struct request *req) */ if (rq_flags & RQF_ALLOCED) { struct request_list *rl = blk_rq_rl(req); - bool sync = rw_is_sync(req_op(req), req->cmd_flags); + bool sync = op_is_sync(req->cmd_flags); BUG_ON(!list_empty(&req->queuelist)); BUG_ON(ELV_ON_HASH(req)); @@ -1652,8 +1648,6 @@ out: void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; - - req->cmd_flags |= bio->bi_opf & REQ_COMMON_MASK; if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; @@ -1665,9 +1659,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { - const bool sync = !!(bio->bi_opf & REQ_SYNC); struct blk_plug *plug; - int el_ret, rw_flags = 0, where = 
ELEVATOR_INSERT_SORT; + int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; @@ -1722,24 +1715,11 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: - /* - * This sync check and mask will be re-done in init_request_from_bio(), - * but we need to set it earlier to expose the sync flag to the - * rq allocator and io schedulers. - */ - if (sync) - rw_flags |= REQ_SYNC; - - /* - * Add in META/PRIO flags, if set, before we get to the IO scheduler - */ - rw_flags |= (bio->bi_opf & (REQ_META | REQ_PRIO)); - /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ - req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO); + req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { bio->bi_error = PTR_ERR(req); bio_endio(bio); @@ -2946,8 +2926,6 @@ EXPORT_SYMBOL_GPL(__blk_end_request_err); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - req_set_op(rq, bio_op(bio)); - if (bio_has_data(bio)) rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -3031,8 +3009,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - req_set_op_attrs(dst, req_op(src), - (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE); + dst->cmd_flags = src->cmd_flags | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -3537,8 +3514,11 @@ EXPORT_SYMBOL(blk_set_runtime_active); int __init blk_dev_init(void) { - BUILD_BUG_ON(__REQ_NR_BITS > 8 * + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * FIELD_SIZEOF(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + FIELD_SIZEOF(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", 
diff --git a/block/blk-flush.c b/block/blk-flush.c index 3990b9cfbda5..95f1d4d357df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -330,7 +330,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) } flush_rq->cmd_type = REQ_TYPE_FS; - req_set_op_attrs(flush_rq, REQ_OP_FLUSH, WRITE_FLUSH); + flush_rq->cmd_flags = REQ_OP_FLUSH | WRITE_FLUSH; flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; diff --git a/block/blk-lib.c b/block/blk-lib.c index 46fe9248410d..18abda862915 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int granularity; - enum req_op op; + unsigned int op; int alignment; sector_t bs_mask; diff --git a/block/blk-map.c b/block/blk-map.c index 2c5ae5fef473..0173a72a8aa9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -16,6 +16,8 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) { if (!rq->bio) { + rq->cmd_flags &= REQ_OP_MASK; + rq->cmd_flags |= (bio->bi_opf & REQ_OP_MASK); blk_rq_bio_prep(rq->q, rq, bio); } else { if (!ll_back_merge_fn(rq->q, rq, bio)) diff --git a/block/blk-mq.c b/block/blk-mq.c index b49c6658eb05..2da1a0ee3318 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -139,14 +139,13 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) EXPORT_SYMBOL(blk_mq_can_queue); static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, - struct request *rq, int op, - unsigned int op_flags) + struct request *rq, unsigned int op) { INIT_LIST_HEAD(&rq->queuelist); /* csd/requeue_work/fifo_time is initialized before use */ rq->q = q; rq->mq_ctx = ctx; - req_set_op_attrs(rq, op, op_flags); + rq->cmd_flags = op; if (blk_queue_io_stat(q)) rq->rq_flags |= RQF_IO_STAT; /* do not touch atomic flags, it needs atomic ops against the timer */ @@ -183,11 
+182,11 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, rq->end_io_data = NULL; rq->next_rq = NULL; - ctx->rq_dispatched[rw_is_sync(op, op_flags)]++; + ctx->rq_dispatched[op_is_sync(op)]++; } static struct request * -__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) +__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op) { struct request *rq; unsigned int tag; @@ -202,7 +201,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int op, int op_flags) } rq->tag = tag; - blk_mq_rq_ctx_init(data->q, data->ctx, rq, op, op_flags); + blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); return rq; } @@ -225,7 +224,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); blk_mq_put_ctx(ctx); if (!rq) { @@ -277,7 +276,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask)); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); + rq = __blk_mq_alloc_request(&alloc_data, rw); if (!rq) { ret = -EWOULDBLOCK; goto out_queue_exit; @@ -1196,19 +1195,14 @@ static struct request *blk_mq_map_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct request *rq; - int op = bio_data_dir(bio); - int op_flags = 0; blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, ctx->cpu); - if (rw_is_sync(bio_op(bio), bio->bi_opf)) - op_flags |= REQ_SYNC; - - trace_block_getrq(q, bio, op); + trace_block_getrq(q, bio, bio->bi_opf); blk_mq_set_alloc_data(data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(data, op, op_flags); + rq = __blk_mq_alloc_request(data, bio->bi_opf); data->hctx->queued++; 
return rq; @@ -1256,7 +1250,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) */ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; @@ -1350,7 +1344,7 @@ done: */ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { - const int is_sync = rw_is_sync(bio_op(bio), bio->bi_opf); + const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_plug *plug; unsigned int request_count = 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e24d880306c..c96186adaa66 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -667,10 +667,10 @@ static inline void cfqg_put(struct cfq_group *cfqg) } while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, - int op_flags) + struct cfq_group *curr_cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, 1); + blkg_rwstat_add(&cfqg->stats.queued, op, 1); cfqg_stats_end_empty_time(&cfqg->stats); cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); } @@ -684,30 +684,29 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, #endif } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.queued, op, op_flags, -1); + blkg_rwstat_add(&cfqg->stats.queued, op, -1); } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { - blkg_rwstat_add(&cfqg->stats.merged, op, 
op_flags, 1); + blkg_rwstat_add(&cfqg->stats.merged, op, 1); } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { struct cfqg_stats *stats = &cfqg->stats; unsigned long long now = sched_clock(); if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, op_flags, - now - io_start_time); + blkg_rwstat_add(&stats->service_time, op, now - io_start_time); if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, op_flags, + blkg_rwstat_add(&stats->wait_time, op, io_start_time - start_time); } @@ -786,16 +785,16 @@ static inline void cfqg_put(struct cfq_group *cfqg) { } #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, int op, int op_flags) { } + struct cfq_group *curr_cfqg, unsigned int op) { } static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int op, - int op_flags) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int op, - int op_flags) { } +static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, + unsigned int op) { } +static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, + unsigned int op) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - uint64_t start_time, uint64_t io_start_time, int op, - int op_flags) { } + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -2474,10 +2473,10 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; - 
cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); cfq_add_rq_rb(rq); cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - req_op(rq), rq->cmd_flags); + rq->cmd_flags); } static struct request * @@ -2530,7 +2529,7 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), req_op(rq), rq->cmd_flags); + cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); if (rq->cmd_flags & REQ_PRIO) { WARN_ON(!cfqq->prio_pending); cfqq->prio_pending--; @@ -2565,7 +2564,7 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, static void cfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { - cfqg_stats_update_io_merged(RQ_CFQG(req), bio_op(bio), bio->bi_opf); + cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); } static void @@ -2588,7 +2587,7 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), req_op(next), next->cmd_flags); + cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); cfqq = RQ_CFQQ(next); /* @@ -4142,7 +4141,7 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, req_op(rq), + cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, rq->cmd_flags); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -4240,8 +4239,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfqq->dispatched--; (RQ_CFQG(rq))->dispatched--; cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq), - rq_io_start_time_ns(rq), req_op(rq), - rq->cmd_flags); + rq_io_start_time_ns(rq), 
rq->cmd_flags); cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; @@ -4319,14 +4317,14 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_schedule_dispatch(cfqd); } -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, int op_flags) +static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) { /* * If REQ_PRIO is set, boost class and prio level, if it's below * BE/NORM. If prio is not set, restore the potentially boosted * class/prio level. */ - if (!(op_flags & REQ_PRIO)) { + if (!(op & REQ_PRIO)) { cfqq->ioprio_class = cfqq->org_ioprio_class; cfqq->ioprio = cfqq->org_ioprio; } else { @@ -4347,7 +4345,7 @@ static inline int __cfq_may_queue(struct cfq_queue *cfqq) return ELV_MQUEUE_MAY; } -static int cfq_may_queue(struct request_queue *q, int op, int op_flags) +static int cfq_may_queue(struct request_queue *q, unsigned int op) { struct cfq_data *cfqd = q->elevator->elevator_data; struct task_struct *tsk = current; @@ -4364,10 +4362,10 @@ static int cfq_may_queue(struct request_queue *q, int op, int op_flags) if (!cic) return ELV_MQUEUE_MAY; - cfqq = cic_to_cfqq(cic, rw_is_sync(op, op_flags)); + cfqq = cic_to_cfqq(cic, op_is_sync(op)); if (cfqq) { cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op_flags); + cfqq_boost_on_prio(cfqq, op); return __cfq_may_queue(cfqq); } diff --git a/block/elevator.c b/block/elevator.c index ac80f89a0842..a18a5db274e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -714,12 +714,12 @@ void elv_put_request(struct request_queue *q, struct request *rq) e->type->ops.elevator_put_req_fn(rq); } -int elv_may_queue(struct request_queue *q, int op, int op_flags) +int elv_may_queue(struct request_queue *q, unsigned int op) { struct elevator_queue *e = q->elevator; if (e->type->ops.elevator_may_queue_fn) - return e->type->ops.elevator_may_queue_fn(q, op, op_flags); + return e->type->ops.elevator_may_queue_fn(q, op); return ELV_MQUEUE_MAY; } diff --git a/drivers/md/dm-crypt.c 
b/drivers/md/dm-crypt.c index a2768835d394..68a9eb4f3f36 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1135,7 +1135,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); + clone->bi_opf = io->base_bio->bi_opf; } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cef1f78031d4..65738b0aad36 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1031,8 +1031,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; } else { - scmd_printk(KERN_ERR, SCpnt, "Unknown command %llu,%llx\n", - req_op(rq), (unsigned long long) rq->cmd_flags); + scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b790bda7998..9a377079af26 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8427,7 +8427,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8465,8 +8465,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..a29335867e30 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3118,7 +3118,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: 
req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2cf4f7f09e32 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -688,7 +688,7 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 3bf5d33800ab..ddaf28d0988f 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -581,15 +581,14 @@ static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The * caller is responsible for synchronizing calls to this function. 
*/ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int op, int op_flags, uint64_t val) + unsigned int op, uint64_t val) { struct percpu_counter *cnt; @@ -600,7 +599,7 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, __percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_flags & REQ_SYNC) + if (op & REQ_SYNC) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; @@ -705,9 +704,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, if (!throtl) { blkg = blkg ?: q->root_blkg; - blkg_rwstat_add(&blkg->stat_bytes, bio_op(bio), bio->bi_opf, + blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio_op(bio), bio->bi_opf, 1); + blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); } rcu_read_unlock(); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index ec69a8fe3b29..dca972d67548 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -88,24 +88,6 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) -#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) -#define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) - -#define bio_set_op_attrs(bio, op, op_flags) do { \ - if (__builtin_constant_p(op)) \ - BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ - else \ - WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ - if (__builtin_constant_p(op_flags)) \ - BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - else \ - WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ - (bio)->bi_opf = bio_flags(bio); \ - (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= (op_flags); \ -} while (0) - #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) /* @@ -147,26 +129,40 @@ struct bio { #endif /* CONFIG_BLOCK */ /* - * Request flags. 
For use in the cmd_flags field of struct request, and in - * bi_opf of struct bio. Note that some flags are only valid in either one. + * Operations and flags common to the bio and request structures. + * We use 8 bits for encoding the operation, and the remaining 24 for flags. */ -enum rq_flag_bits { - /* common flags */ - __REQ_FAILFAST_DEV, /* no driver retries of device errors */ +#define REQ_OP_BITS 8 +#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1) +#define REQ_FLAG_BITS 24 + +enum req_opf { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ + REQ_OP_ZONE_REPORT, /* Get zone information */ + REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ + + REQ_OP_LAST, +}; + +enum req_flag_bits { + __REQ_FAILFAST_DEV = /* no driver retries of device errors */ + REQ_OP_BITS, __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */ __REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */ - __REQ_SYNC, /* request is sync (sync write or read) */ __REQ_META, /* metadata io request */ __REQ_PRIO, /* boost priority in cfq */ - __REQ_NOMERGE, /* don't touch this for merging */ __REQ_NOIDLE, /* don't anticipate more IO after this one */ __REQ_INTEGRITY, /* I/O includes block integrity payload */ __REQ_FUA, /* forced unit access */ __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ - __REQ_NR_BITS, /* stops here */ }; @@ -176,37 +172,32 @@ enum rq_flag_bits { #define REQ_SYNC (1ULL << __REQ_SYNC) #define REQ_META (1ULL << __REQ_META) #define REQ_PRIO (1ULL << __REQ_PRIO) +#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) #define REQ_NOIDLE (1ULL << __REQ_NOIDLE) #define REQ_INTEGRITY (1ULL << __REQ_INTEGRITY) +#define REQ_FUA (1ULL << __REQ_FUA) +#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) 
#define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -#define REQ_COMMON_MASK \ - (REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \ - REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_RAHEAD) -#define REQ_CLONE_MASK REQ_COMMON_MASK -/* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA) -#define REQ_RAHEAD (1ULL << __REQ_RAHEAD) -#define REQ_FUA (1ULL << __REQ_FUA) -#define REQ_NOMERGE (1ULL << __REQ_NOMERGE) -#define REQ_PREFLUSH (1ULL << __REQ_PREFLUSH) +#define bio_op(bio) \ + ((bio)->bi_opf & REQ_OP_MASK) +#define req_op(req) \ + ((req)->cmd_flags & REQ_OP_MASK) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ - REQ_OP_ZONE_REPORT, /* Get zone information */ - REQ_OP_ZONE_RESET, /* Reset a zone write pointer */ -}; +/* obsolete, don't use in new code */ +#define bio_set_op_attrs(bio, op, op_flags) \ + ((bio)->bi_opf |= (op | op_flags)) -#define REQ_OP_BITS 3 +static inline bool op_is_sync(unsigned int op) +{ + return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC); +} typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b4415feac679..8396da2bb698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -142,7 +142,7 @@ struct request { int cpu; unsigned cmd_type; - u64 cmd_flags; + unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; unsigned long atomic_flags; @@ -244,20 +244,6 @@ struct request { struct request *next_rq; }; -#define REQ_OP_SHIFT (8 * sizeof(u64) - REQ_OP_BITS) -#define req_op(req) ((req)->cmd_flags >> REQ_OP_SHIFT) - -#define req_set_op(req, op) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); 
\ - (req)->cmd_flags &= ((1ULL << REQ_OP_SHIFT) - 1); \ - (req)->cmd_flags |= ((u64) (op) << REQ_OP_SHIFT); \ -} while (0) - -#define req_set_op_attrs(req, op, flags) do { \ - req_set_op(req, op); \ - (req)->cmd_flags |= flags; \ -} while (0) - static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; @@ -741,17 +727,9 @@ static inline unsigned int blk_queue_zone_size(struct request_queue *q) return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } -/* - * We regard a request as sync, if either a read or a sync write - */ -static inline bool rw_is_sync(int op, unsigned int rw_flags) -{ - return op == REQ_OP_READ || (rw_flags & REQ_SYNC); -} - static inline bool rq_is_sync(struct request *rq) { - return rw_is_sync(req_op(rq), rq->cmd_flags); + return op_is_sync(rq->cmd_flags); } static inline bool blk_rl_full(struct request_list *rl, bool sync) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cceb72f9e29f..e417f080219a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -118,7 +118,7 @@ static inline int blk_cmd_buf_len(struct request *rq) } extern void blk_dump_cmd(char *buf, struct request *rq); -extern void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes); +extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes); #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ diff --git a/include/linux/dm-io.h b/include/linux/dm-io.h index b91b023deffb..a52c6580cc9a 100644 --- a/include/linux/dm-io.h +++ b/include/linux/dm-io.h @@ -58,7 +58,7 @@ struct dm_io_notify { struct dm_io_client; struct dm_io_request { int bi_op; /* REQ_OP */ - int bi_op_flags; /* rq_flag_bits */ + int bi_op_flags; /* req_flag_bits */ struct dm_io_memory mem; /* Memory to use for io */ struct dm_io_notify notify; /* Synchronous if notify.fn is NULL */ struct dm_io_client *client; /* Client memory handler */ diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 
e7f358d2e5fc..f219c9aed360 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -30,7 +30,7 @@ typedef int (elevator_dispatch_fn) (struct request_queue *, int); typedef void (elevator_add_req_fn) (struct request_queue *, struct request *); typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *); typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *); -typedef int (elevator_may_queue_fn) (struct request_queue *, int, int); +typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int); typedef void (elevator_init_icq_fn) (struct io_cq *); typedef void (elevator_exit_icq_fn) (struct io_cq *); @@ -139,7 +139,7 @@ extern struct request *elv_former_request(struct request_queue *, struct request extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); extern void elv_unregister_queue(struct request_queue *q); -extern int elv_may_queue(struct request_queue *, int, int); +extern int elv_may_queue(struct request_queue *, unsigned int); extern void elv_completed_request(struct request_queue *, struct request *); extern int elv_set_request(struct request_queue *q, struct request *rq, struct bio *bio, gfp_t gfp_mask); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index d336b890e31f..df3e9ae5ad8d 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -27,8 +27,7 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->sector = bio->bi_iter.bi_sector; __entry->orig_sector = bio->bi_iter.bi_sector - 16; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", @@ -102,8 +101,7 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector 
= bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u", @@ -138,8 +136,7 @@ TRACE_EVENT(bcache_read, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->cache_hit = hit; __entry->bypass = bypass; ), @@ -170,8 +167,7 @@ TRACE_EVENT(bcache_write, __entry->inode = inode; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); __entry->writeback = writeback; __entry->bypass = bypass; ), diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8f3a163b8166..3e02e3a25413 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -84,8 +84,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error, 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); ), @@ -163,7 +162,7 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector = nr_bytes >> 9; __entry->errors = rq->errors; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, nr_bytes); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); blk_dump_cmd(__get_str(cmd), rq); ), @@ -199,8 +198,7 @@ DECLARE_EVENT_CLASS(block_rq, __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 
blk_rq_bytes(rq) : 0; - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); blk_dump_cmd(__get_str(cmd), rq); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -274,8 +272,7 @@ TRACE_EVENT(block_bio_bounce, bio->bi_bdev->bd_dev : 0; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -313,8 +310,7 @@ TRACE_EVENT(block_bio_complete, __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = error; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u [%d]", @@ -341,8 +337,7 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -409,8 +404,7 @@ TRACE_EVENT(block_bio_queue, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -438,7 +432,7 @@ DECLARE_EVENT_CLASS(block_get_rq, __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, bio ? 
bio_op(bio) : 0, + blk_fill_rwbs(__entry->rwbs, bio ? bio->bi_opf : 0, __entry->nr_sector); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -573,8 +567,7 @@ TRACE_EVENT(block_split, __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), @@ -617,8 +610,7 @@ TRACE_EVENT(block_bio_remap, __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; - blk_fill_rwbs(__entry->rwbs, bio_op(bio), bio->bi_opf, - bio->bi_iter.bi_size); + blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", @@ -664,8 +656,7 @@ TRACE_EVENT(block_rq_remap, __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); - blk_fill_rwbs(__entry->rwbs, req_op(rq), rq->cmd_flags, - blk_rq_bytes(rq)); + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index dbafc5df03f3..95cecbf67f5c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1777,14 +1777,14 @@ void blk_dump_cmd(char *buf, struct request *rq) } } -void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) +void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes) { int i = 0; - if (rw & REQ_PREFLUSH) + if (op & REQ_PREFLUSH) rwbs[i++] = 'F'; - switch (op) { + switch (op & REQ_OP_MASK) { case REQ_OP_WRITE: case REQ_OP_WRITE_SAME: rwbs[i++] = 'W'; @@ -1806,13 +1806,13 @@ void blk_fill_rwbs(char *rwbs, int op, u32 rw, int bytes) rwbs[i++] = 'N'; } - if (rw & REQ_FUA) + if (op & REQ_FUA) rwbs[i++] = 'F'; - if (rw & REQ_RAHEAD) + if (op & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & REQ_SYNC) + if (op & REQ_SYNC) 
rwbs[i++] = 'S'; - if (rw & REQ_META) + if (op & REQ_META) rwbs[i++] = 'M'; rwbs[i] = '\0'; -- cgit From 6a83e74d214a47a1371cd2e6a783264fcba7d428 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 2 Nov 2016 10:09:51 -0600 Subject: blk-mq: Introduce blk_mq_quiesce_queue() blk_mq_quiesce_queue() waits until ongoing .queue_rq() invocations have finished. This function does *not* wait until all outstanding requests have finished (this means invocation of request.end_io()). The algorithm used by blk_mq_quiesce_queue() is as follows: * Hold either an RCU read lock or an SRCU read lock around .queue_rq() calls. The former is used if .queue_rq() does not block and the latter if .queue_rq() may block. * blk_mq_quiesce_queue() first calls blk_mq_stop_hw_queues() followed by synchronize_srcu() or synchronize_rcu(). The latter call waits for .queue_rq() invocations that started before blk_mq_quiesce_queue() was called. * The blk_mq_hctx_stopped() calls that control whether or not .queue_rq() will be called are called with the (S)RCU read lock held. This is necessary to avoid race conditions against blk_mq_quiesce_queue(). Signed-off-by: Bart Van Assche Cc: Hannes Reinecke Cc: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/Kconfig | 1 + block/blk-mq.c | 71 +++++++++++++++++++++++++++++++++++++++++++++----- include/linux/blk-mq.h | 3 +++ include/linux/blkdev.h | 1 + 4 files changed, 69 insertions(+), 7 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/Kconfig b/block/Kconfig index 6b0ad08f0677..3a024440a669 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,6 +5,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y select SBITMAP + select SRCU help Provide block layer support for the kernel. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index a461823644fb..3dc323543293 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -115,6 +115,33 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); +/** + * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. Additionally, it is not prevented that + * new queue_rq() calls occur unless the queue has been stopped first. + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + blk_mq_stop_hw_queues(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(&hctx->queue_rq_srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; @@ -766,7 +793,7 @@ static inline unsigned int queued_to_index(unsigned int queued) * of IO. In particular, we'd like FIFO behaviour on handling existing * items on the hctx->dispatch list. Ignore that for now. 
*/ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct request *rq; @@ -778,9 +805,6 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx))) return; - WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)); - hctx->run++; /* @@ -871,6 +895,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } } +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + int srcu_idx; + + WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)); + + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_process_rq_list(hctx); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); + blk_mq_process_rq_list(hctx); + srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); + } +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -1268,7 +1310,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) const int is_flush_fua = bio->bi_opf & (REQ_PREFLUSH | REQ_FUA); struct blk_mq_alloc_data data; struct request *rq; - unsigned int request_count = 0; + unsigned int request_count = 0, srcu_idx; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1311,7 +1353,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio); /* - * We do limited pluging. If the bio can be merged, do that. + * We do limited plugging. If the bio can be merged, do that. * Otherwise the existing request in the plug list will be * issued. 
So the plug list will have one request at most */ @@ -1331,7 +1373,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_put_ctx(data.ctx); if (!old_rq) goto done; - blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + + if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) { + rcu_read_lock(); + blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + rcu_read_unlock(); + } else { + srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu); + blk_mq_try_issue_directly(data.hctx, old_rq, &cookie); + srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx); + } goto done; } @@ -1610,6 +1661,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); + if (hctx->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(&hctx->queue_rq_srcu); + blk_mq_remove_cpuhp(hctx); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); @@ -1690,6 +1744,9 @@ static int blk_mq_init_hctx(struct request_queue *q, flush_start_tag + hctx_idx, node)) goto free_fq; + if (hctx->flags & BLK_MQ_F_BLOCKING) + init_srcu_struct(&hctx->queue_rq_srcu); + return 0; free_fq: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a85a20f80aaa..ed20ac74c62a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -3,6 +3,7 @@ #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -35,6 +36,8 @@ struct blk_mq_hw_ctx { struct blk_mq_tags *tags; + struct srcu_struct queue_rq_srcu; + unsigned long queued; unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8396da2bb698..13d893a69b46 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,7 @@ extern void __blk_run_queue(struct request_queue *q); extern void __blk_run_queue_uncond(struct request_queue *q); extern void blk_run_queue(struct request_queue *); extern void blk_run_queue_async(struct request_queue *q); +extern void 
blk_mq_quiesce_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); -- cgit From 50d24c34403c62ad29e8b6db559d491bae20b4b7 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Nov 2016 17:03:53 -0700 Subject: block: immediately dispatch big size request Currently block plug holds up to 16 non-mergeable requests. This makes sense if the request size is small, e.g., to reduce lock contention. But if request size is big enough, we don't need to worry about lock contention. Holding such a request makes no sense and it lowers the disk utilization. In practice, this improves 10% throughput for my raid5 sequential write workload. The size (128k) is arbitrary right now, but it makes sure lock contention is small. This probably could be more intelligent, e.g., check the average request size held. Since this is mainly for sequential IO, it is probably not worth it. V2: check the last request instead of the first request, so as long as there is one big size request we flush the plug.
Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +++- include/linux/blkdev.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 0bfaa54d3e9f..2deca48a4a05 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1746,7 +1746,9 @@ get_rq: if (!request_count) trace_block_plug(q); else { - if (request_count >= BLK_MAX_REQUEST_COUNT) { + struct request *last = list_entry_rq(plug->list.prev); + if (request_count >= BLK_MAX_REQUEST_COUNT || + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { blk_flush_plug_list(plug, false); trace_block_plug(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13d893a69b46..9189a2d5c392 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1173,6 +1173,7 @@ struct blk_plug { struct list_head cb_list; /* md requires an unplug callback */ }; #define BLK_MAX_REQUEST_COUNT 16 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); -- cgit From d278d4a8892f13b6a9eb6102b356402f0e062324 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Mar 2016 10:21:08 -0600 Subject: block: add code to track actual device queue depth For blk-mq, ->nr_requests does track queue depth, at least at init time. But for the older queue paths, it's simply a soft setting. On top of that, it's generally larger than the hardware setting on purpose, to allow backup of requests for merging. Fill a hole in struct request_queue with a 'queue_depth' member, which drivers can set through blk_set_queue_depth() to more closely inform the block layer of the real queue depth.
Signed-off-by: Jens Axboe Reviewed-by: Jan Kara --- block/blk-settings.c | 12 ++++++++++++ drivers/scsi/scsi.c | 3 +++ include/linux/blkdev.h | 11 +++++++++++ 3 files changed, 26 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-settings.c b/block/blk-settings.c index 55369a65dea2..9cf053759363 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -836,6 +836,18 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable) } EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); +/** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; +} +EXPORT_SYMBOL(blk_set_queue_depth); + /** * blk_queue_write_cache - configure queue's write cache * @q: the request queue for the device diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 1deb6adc411f..75455d4dab68 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) wmb(); } + if (sdev->request_queue) + blk_set_queue_depth(sdev->request_queue, depth); + return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9189a2d5c392..d364be6e6959 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -405,6 +405,8 @@ struct request_queue { struct blk_mq_ctx __percpu *queue_ctx; unsigned int nr_queues; + unsigned int queue_depth; + /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; @@ -777,6 +779,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) return false; } +static inline unsigned int blk_queue_depth(struct request_queue *q) +{ + if (q->queue_depth) + return q->queue_depth; + + return q->nr_requests; +} + /* * q->prep_rq_fn return values */ @@ 
-1094,6 +1104,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, -- cgit From cf43e6be865a582ba66ee4747ae27a0513f6bba1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Nov 2016 21:32:37 -0700 Subject: block: add scalable completion tracking of requests For legacy block, we simply track them in the request queue. For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state. The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. The feature is off by default, to avoid any extra overhead. In-kernel users of it can turn it on by setting QUEUE_FLAG_STATS in the queue flags. We currently don't turn it on if someone just reads any of the stats files, that is something we could add as well. 
Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-core.c | 14 ++- block/blk-mq-sysfs.c | 47 +++++++++ block/blk-mq.c | 25 +++++ block/blk-mq.h | 3 + block/blk-stat.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++ block/blk-stat.h | 42 ++++++++ block/blk-sysfs.c | 26 +++++ include/linux/blk_types.h | 16 +++ include/linux/blkdev.h | 7 ++ 10 files changed, 427 insertions(+), 3 deletions(-) create mode 100644 block/blk-stat.c create mode 100644 block/blk-stat.h (limited to 'include/linux/blkdev.h') diff --git a/block/Makefile b/block/Makefile index 934dac73fb37..2528c596f7ec 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 2deca48a4a05..216372b01624 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2464,6 +2464,11 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { + blk_stat_set_issue_time(&req->issue_stat); + req->rq_flags |= RQF_STATS; + } + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. 
@@ -2683,8 +2688,13 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { + struct request_queue *q = req->q; + + if (req->rq_flags & RQF_STATS) + blk_stat_add(&q->rq_stats[rq_data_dir(req)], req); + if (req->rq_flags & RQF_QUEUED) - blk_queue_end_tag(req->q, req); + blk_queue_end_tag(q, req); BUG_ON(blk_queued_rq(req)); @@ -2704,7 +2714,7 @@ void blk_finish_request(struct request *req, int error) if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); - __blk_put_request(req->q, req); + __blk_put_request(q, req); } } EXPORT_SYMBOL(blk_finish_request); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 01fb455d3377..eacd3af72099 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[BLK_STAT_READ]); + blk_stat_init(&stat[BLK_STAT_WRITE]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry 
blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .show = blk_mq_hw_sysfs_poll_show, .store = blk_mq_hw_sysfs_poll_store, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 6f5cb3f3dcac..19795886d46e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -30,6 +30,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -403,10 +404,27 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + if (rq->rq_flags & RQF_STATS) { + /* + * We could rq->mq_ctx here, but there's less of a risk + * of races if we have the completion event add the stats + * to the local software queue. 
+ */ + struct blk_mq_ctx *ctx; + + ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id()); + blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq); + } +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -450,6 +468,11 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + blk_stat_set_issue_time(&rq->issue_stat); + rq->rq_flags |= RQF_STATS; + } + blk_add_timer(rq); /* @@ -1784,6 +1807,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[BLK_STAT_READ]); + blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) diff --git a/block/blk-mq.h b/block/blk-mq.h index ac772dac7ce8..b444370ae05b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -18,6 +20,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000000..688c958367ee --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,248 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" + +static void blk_stat_flush_batch(struct blk_rq_stat *stat) +{ + const s32 nr_batch = READ_ONCE(stat->nr_batch); + const s32 nr_samples = READ_ONCE(stat->nr_batch); + + if (!nr_batch) + return; + if (!nr_samples) + stat->mean = div64_s64(stat->batch, 
nr_batch); + else { + stat->mean = div64_s64((stat->mean * nr_samples) + + stat->batch, + nr_batch + nr_samples); + } + + stat->nr_samples += nr_batch; + stat->nr_batch = stat->batch = 0; +} + +static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + blk_stat_flush_batch(src); + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + uint64_t latest = 0; + int i, j, nr; + + blk_stat_init(&dst[BLK_STAT_READ]); + blk_stat_init(&dst[BLK_STAT_WRITE]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + if (newest > latest) + latest = newest; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. 
+ */ + } while (!nr); + + dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest; +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ], + sizeof(struct blk_rq_stat)); + memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE], + sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[BLK_STAT_READ].nr_samples && + !ctx->stat[BLK_STAT_WRITE].nr_samples) + continue; + + if (ctx->stat[BLK_STAT_READ].time > newest) + newest = ctx->stat[BLK_STAT_READ].time; + if (ctx->stat[BLK_STAT_WRITE].time > newest) + newest = ctx->stat[BLK_STAT_WRITE].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[BLK_STAT_READ].time == newest) { + blk_stat_sum(&dst[BLK_STAT_READ], + &ctx->stat[BLK_STAT_READ]); + nr++; + } + if (ctx->stat[BLK_STAT_WRITE].time == newest) { + blk_stat_sum(&dst[BLK_STAT_WRITE], + &ctx->stat[BLK_STAT_WRITE]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. 
+ * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = stat->nr_batch = 0; + stat->time = time_now & BLK_STAT_NSEC_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now) +{ + return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK); +} + +bool blk_stat_is_current(struct blk_rq_stat *stat) +{ + return __blk_stat_is_current(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 now, value; + + now = __blk_stat_time(ktime_to_ns(ktime_get())); + if (now < blk_stat_time(&rq->issue_stat)) + return; + + if (!__blk_stat_is_current(stat, now)) + __blk_stat_init(stat, now); + + value = now - blk_stat_time(&rq->issue_stat); + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + if (stat->batch + value < stat->batch || + stat->nr_batch + 1 == BLK_RQ_STAT_BATCH) + blk_stat_flush_batch(stat); + + stat->batch += value; + stat->nr_batch++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[BLK_STAT_READ]); + blk_stat_init(&ctx->stat[BLK_STAT_WRITE]); + } + } + } else { + blk_stat_init(&q->rq_stats[BLK_STAT_READ]); + blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]); + } +} + +void blk_stat_set_issue_time(struct blk_issue_stat *stat) +{ + stat->time = (stat->time & BLK_STAT_MASK) | + (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK); +} + +/* + * Enable stat tracking, return whether it was enabled + */ +bool blk_stat_enable(struct request_queue *q) +{ + if 
(!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + set_bit(QUEUE_FLAG_STATS, &q->queue_flags); + return false; + } + + return true; +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000000..a2050a0a5314 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,42 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1) + +/* + * Upper 3 bits can be used elsewhere + */ +#define BLK_STAT_RES_BITS 3 +#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS) +#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1) +#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK + +enum { + BLK_STAT_READ = 0, + BLK_STAT_WRITE, +}; + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *); +void blk_stat_init(struct blk_rq_stat *); +bool blk_stat_is_current(struct blk_rq_stat *); +void blk_stat_set_issue_time(struct blk_issue_stat *); +bool blk_stat_enable(struct request_queue *); + +static inline u64 __blk_stat_time(u64 time) +{ + return time & BLK_STAT_TIME_MASK; +} + +static inline u64 blk_stat_time(struct blk_issue_stat *stat) +{ + return __blk_stat_time(stat->time); +} + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 488c2e28feb8..9cdb7247727a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -401,6 +401,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_dax(q), page); } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t 
queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[BLK_STAT_READ], "read :"); + ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -553,6 +573,11 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -582,6 +607,7 @@ static struct attribute *default_attrs[] = { &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_dax_entry.attr, + &queue_stats_entry.attr, NULL, }; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 562ac46cb790..4d0044d09984 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -250,4 +250,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_issue_stat { + u64 time; +}; + +#define BLK_RQ_STAT_BATCH 64 + +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s32 nr_samples; + s32 nr_batch; + u64 batch; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d364be6e6959..303723a2e5b8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -117,6 +117,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) +/* IO stats tracking on */ +#define RQF_STATS ((__force req_flags_t)(1 << 17)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -197,6 +199,7 @@ struct request 
{ struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + struct blk_issue_stat issue_stat; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -492,6 +495,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -585,6 +591,7 @@ struct request_queue { #define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ +#define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit From 87760e5eef359788047d6fd54fc12eec74ce0d27 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Nov 2016 12:38:14 -0700 Subject: block: hook up writeback throttling Enable throttling of buffered writeback to make it a lot more smooth, and has way less impact on other system activity. Background writeback should be, by definition, background activity. The fact that we flush huge bundles of it at the time means that it potentially has heavy impacts on foreground workloads, which isn't ideal. We can't easily limit the sizes of writes that we do, since that would impact file system layout in the presence of delayed allocation. So just throttle back buffered writeback, unless someone is waiting for it. The algorithm for when to throttle takes its inspiration in the CoDel networking scheduling algorithm. Like CoDel, blk-wb monitors the minimum latencies of requests over a window of time. In that window of time, if the minimum latency of any request exceeds a given target, then a scale count is incremented and the queue depth is shrunk. The next monitoring window is shrunk accordingly. 
Unlike CoDel, if we hit a window that exhibits good behavior, then we simply increment the scale count and re-calculate the limits for that scale value. This prevents us from oscillating between a close-to-ideal value and max all the time, instead remaining in the windows where we get good behavior. Unlike CoDel, blk-wb allows the scale count to go negative. This happens if we primarily have writes going on. Unlike positive scale counts, this doesn't change the size of the monitoring window. When the heavy writers finish, blk-wb quickly snaps back to its stable state of a zero scale count. The patch registers a sysfs entry, 'wb_lat_usec'. This sets the latency target to be met. It defaults to 2 msec for non-rotational storage, and 75 msec for rotational storage. Setting this value to '0' disables blk-wb. Generally, a user would not have to touch this setting. We don't enable WBT on devices that are managed with CFQ, and have a non-root block cgroup attached. If we have a proportional share setup on this particular disk, then the wbt throttling will interfere with that. We don't have a strong need for wbt for that case, since we will rely on CFQ doing that for us. Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.txt | 7 +++ block/Kconfig | 26 +++++++++++ block/blk-core.c | 17 ++++++- block/blk-mq.c | 26 ++++++++++- block/blk-settings.c | 4 ++ block/blk-sysfs.c | 88 +++++++++++++++++++++++++++++++++++++ block/cfq-iosched.c | 14 ++++++ include/linux/blkdev.h | 3 ++ 8 files changed, 181 insertions(+), 4 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index 2a3904030dea..87abf1ac2939 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -169,5 +169,12 @@ This is the number of bytes the device can write in a single write-same command. A value of '0' means write-same is not supported by this device.
+wb_lat_usec (RW) +---------------- +If the device is registered for writeback throttling, then this file shows +the target minimum read latency. If this latency is exceeded in a given +window of time (see wb_window_usec), then the writeback throttling will start +scaling back writes. + Jens Axboe , February 2009 diff --git a/block/Kconfig b/block/Kconfig index 3a024440a669..8bf114a3858a 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -121,6 +121,32 @@ config BLK_CMDLINE_PARSER See Documentation/block/cmdline-partition.txt for more information. +config BLK_WBT + bool "Enable support for block device writeback throttling" + default n + ---help--- + Enabling this option enables the block layer to throttle buffered + background writeback from the VM, making it more smooth and having + less impact on foreground operations. The throttling is done + dynamically on an algorithm loosely based on CoDel, factoring in + the realtime performance of the disk. + +config BLK_WBT_SQ + bool "Single queue writeback throttling" + default n + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on legacy single queue devices + +config BLK_WBT_MQ + bool "Multiqueue writeback throttling" + default y + depends on BLK_WBT + ---help--- + Enable writeback throttling by default on multiqueue devices. + Multiqueue currently doesn't have support for IO scheduling, + enabling this option is recommended. 
+ menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/blk-core.c b/block/blk-core.c index 216372b01624..59f8129a4295 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -882,6 +883,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, fail: blk_free_flush_queue(q->fq); + wbt_exit(q); return NULL; } EXPORT_SYMBOL(blk_init_allocated_queue); @@ -1344,6 +1346,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); if (rq->rq_flags & RQF_QUEUED) blk_queue_end_tag(q, rq); @@ -1436,6 +1439,8 @@ void __blk_put_request(struct request_queue *q, struct request *req) /* this is a bio leak */ WARN_ON(req->bio != NULL); + wbt_done(q->rq_wb, &req->issue_stat); + /* * Request may not have originated from ll_rw_blk. if not, * it didn't come out of our reserved rq pools @@ -1663,6 +1668,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) int el_ret, where = ELEVATOR_INSERT_SORT; struct request *req; unsigned int request_count = 0; + unsigned int wb_acct; /* * low level driver can indicate that it wants pages above a @@ -1715,17 +1721,22 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) } get_rq: + wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock); + /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. 
*/ req = get_request(q, bio->bi_opf, bio, GFP_NOIO); if (IS_ERR(req)) { + __wbt_done(q->rq_wb, wb_acct); bio->bi_error = PTR_ERR(req); bio_endio(bio); goto out_unlock; } + wbt_track(&req->issue_stat, wb_acct); + /* * After dropping the lock and possibly sleeping here, our request * may now be mergeable after it had proven unmergeable (above). @@ -2467,6 +2478,7 @@ void blk_start_request(struct request *req) if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { blk_stat_set_issue_time(&req->issue_stat); req->rq_flags |= RQF_STATS; + wbt_issue(req->q->rq_wb, &req->issue_stat); } /* @@ -2708,9 +2720,10 @@ void blk_finish_request(struct request *req, int error) blk_account_io_done(req); - if (req->end_io) + if (req->end_io) { + wbt_done(req->q->rq_wb, &req->issue_stat); req->end_io(req, error); - else { + } else { if (blk_bidi_rq(req)) __blk_put_request(req->next_rq->q, req->next_rq); diff --git a/block/blk-mq.c b/block/blk-mq.c index 19795886d46e..d180c989a0e5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -31,6 +31,7 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-stat.h" +#include "blk-wbt.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -326,6 +327,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, if (rq->rq_flags & RQF_MQ_INFLIGHT) atomic_dec(&hctx->nr_active); + + wbt_done(q->rq_wb, &rq->issue_stat); rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); @@ -354,6 +357,7 @@ inline void __blk_mq_end_request(struct request *rq, int error) blk_account_io_done(rq); if (rq->end_io) { + wbt_done(rq->q->rq_wb, &rq->issue_stat); rq->end_io(rq, error); } else { if (unlikely(blk_bidi_rq(rq))) @@ -471,6 +475,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { blk_stat_set_issue_time(&rq->issue_stat); rq->rq_flags |= RQF_STATS; + wbt_issue(q->rq_wb, &rq->issue_stat); } blk_add_timer(rq); @@ -508,6 +513,7 @@ static void __blk_mq_requeue_request(struct request 
*rq) struct request_queue *q = rq->q; trace_block_rq_requeue(q, rq); + wbt_requeue(q->rq_wb, &rq->issue_stat); if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { if (q->dma_drain_size && blk_rq_bytes(rq)) @@ -1339,6 +1345,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1353,9 +1360,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) return BLK_QC_T_NONE; + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -1439,6 +1452,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) struct blk_mq_alloc_data data; struct request *rq; blk_qc_t cookie; + unsigned int wb_acct; blk_queue_bounce(q, &bio); @@ -1455,9 +1469,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) } else request_count = blk_plug_queued_count(q); + wb_acct = wbt_wait(q->rq_wb, bio, NULL); + rq = blk_mq_map_request(q, bio, &data); - if (unlikely(!rq)) + if (unlikely(!rq)) { + __wbt_done(q->rq_wb, wb_acct); return BLK_QC_T_NONE; + } + + wbt_track(&rq->issue_stat, wb_acct); cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); @@ -2139,6 +2159,8 @@ void blk_mq_free_queue(struct request_queue *q) list_del_init(&q->all_q_node); mutex_unlock(&all_q_mutex); + wbt_exit(q); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); diff --git a/block/blk-settings.c b/block/blk-settings.c index 9cf053759363..c7ccabc0ec3e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -13,6 +13,7 @@ #include #include "blk.h" 
+#include "blk-wbt.h" unsigned long blk_max_low_pfn; EXPORT_SYMBOL(blk_max_low_pfn); @@ -845,6 +846,7 @@ EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); void blk_set_queue_depth(struct request_queue *q, unsigned int depth) { q->queue_depth = depth; + wbt_set_queue_depth(q->rq_wb, depth); } EXPORT_SYMBOL(blk_set_queue_depth); @@ -868,6 +870,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) else queue_flag_clear(QUEUE_FLAG_FUA, q); spin_unlock_irq(q->queue_lock); + + wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cdb7247727a..9262d2d60a09 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -13,6 +13,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-wbt.h" struct queue_sysfs_entry { struct attribute attr; @@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } +static ssize_t queue_var_store64(u64 *var, const char *page) +{ + int err; + u64 v; + + err = kstrtou64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, (page)); @@ -364,6 +378,32 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!q->rq_wb) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + ssize_t ret; + u64 val; + + if (!q->rq_wb) + return -EINVAL; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + + q->rq_wb->min_lat_nsec = val * 1000ULL; + wbt_update_limits(q->rq_wb); + return count; +} + static ssize_t queue_wc_show(struct request_queue *q, char *page) { if 
(test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -578,6 +618,12 @@ static struct queue_sysfs_entry queue_stats_entry = { .show = queue_stats_show, }; +static struct queue_sysfs_entry queue_wb_lat_entry = { + .attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR }, + .show = queue_wb_lat_show, + .store = queue_wb_lat_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -608,6 +654,7 @@ static struct attribute *default_attrs[] = { &queue_wc_entry.attr, &queue_dax_entry.attr, &queue_stats_entry.attr, + &queue_wb_lat_entry.attr, NULL, }; @@ -682,6 +729,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + wbt_exit(q); bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); @@ -722,6 +770,44 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat) +{ + blk_queue_stat_get(data, stat); +} + +static void blk_wb_stat_clear(void *data) +{ + blk_stat_clear(data); +} + +static bool blk_wb_stat_is_current(struct blk_rq_stat *stat) +{ + return blk_stat_is_current(stat); +} + +static struct wb_stat_ops wb_stat_ops = { + .get = blk_wb_stat_get, + .is_current = blk_wb_stat_is_current, + .clear = blk_wb_stat_clear, +}; + +static void blk_wb_init(struct request_queue *q) +{ +#ifndef CONFIG_BLK_WBT_MQ + if (q->mq_ops) + return; +#endif +#ifndef CONFIG_BLK_WBT_SQ + if (q->request_fn) + return; +#endif + + /* + * If this fails, we don't get throttling + */ + wbt_init(q, &wb_stat_ops); +} + int blk_register_queue(struct gendisk *disk) { int ret; @@ -761,6 +847,8 @@ int blk_register_queue(struct gendisk *disk) if (q->mq_ops) blk_mq_register_dev(dev, q); + blk_wb_init(q); + if (!q->request_fn) return 0; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 61010511c5a0..e280d08ef6d7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -16,6 +16,7 @@ #include 
#include #include "blk.h" +#include "blk-wbt.h" /* * tunables @@ -3762,9 +3763,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; uint64_t serial_nr; + bool nonroot_cg; rcu_read_lock(); serial_nr = bio_blkcg(bio)->css.serial_nr; + nonroot_cg = bio_blkcg(bio) != &blkcg_root; rcu_read_unlock(); /* @@ -3774,6 +3777,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; + /* + * If we have a non-root cgroup, we can depend on that to + * do proper throttling of writes. Turn off wbt for that + * case. + */ + if (nonroot_cg) { + struct request_queue *q = cfqd->queue; + + wbt_disable(q->rq_wb); + } + /* * Drop reference to queues. New queues will be assigned in new * group upon arrival of fresh requests. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 303723a2e5b8..15da9e430f90 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; +struct rq_wb; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -383,6 +384,8 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct rq_wb *rq_wb; + /* * If blkcg is not used, @q->root_rl serves all requests. If blkcg * is used, root blkg allocates from @q->root_rl and all other -- cgit From bbd7bb7017d5c2b1e75f3818b4ce88fa58bb0eab Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Nov 2016 09:34:34 -0600 Subject: block: move poll code to blk-mq The poll code is blk-mq specific, let's move it to blk-mq.c. This is a prep patch for improving the polling code. 
Signed-off-by: Jens Axboe Reviewed-by: Christoph Hellwig --- block/blk-core.c | 46 ------------------------------------- block/blk-mq.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/io-cmd.c | 2 +- fs/direct-io.c | 2 +- include/linux/blkdev.h | 2 +- 5 files changed, 57 insertions(+), 49 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-core.c b/block/blk-core.c index 59f8129a4295..eea246567884 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3312,52 +3312,6 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - struct blk_plug *plug; - long state; - unsigned int queue_num; - struct blk_mq_hw_ctx *hctx; - - if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - queue_num = blk_qc_t_to_queue_num(cookie); - hctx = q->queue_hw_ctx[queue_num]; - hctx->poll_considered++; - - plug = current->plug; - if (plug) - blk_flush_plug_list(plug, false); - - state = current->state; - while (!need_resched()) { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); - if (ret > 0) { - hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; - } - - if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return true; - if (ret < 0) - break; - cpu_relax(); - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_poll); - #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine diff --git a/block/blk-mq.c b/block/blk-mq.c index 77110aed24ea..ae8df5ec20d3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2461,6 +2461,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + 
struct request_queue *q = hctx->queue; + long state; + + hctx->poll_considered++; + + state = current->state; + while (!need_resched()) { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, rq->tag); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_plug *plug; + struct request *rq; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + + return __blk_mq_poll(hctx, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_poll); + void blk_mq_disable_hotplug(void) { mutex_lock(&all_q_mutex); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index c2784cfc5e29..ef52b1e70144 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -96,7 +96,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) diff --git a/fs/direct-io.c b/fs/direct-io.c index a5138c564019..835e23a4ee4b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) 
io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 15da9e430f90..bab18ee5810d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,7 +952,7 @@ extern int blk_execute_rq(struct request_queue *, struct gendisk *, extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); -bool blk_poll(struct request_queue *q, blk_qc_t cookie); +bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { -- cgit From 06426adf072bca62ac31ea396ff2159a34f276c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:01:59 -0700 Subject: blk-mq: implement hybrid poll mode for sync O_DIRECT This patch enables a hybrid polling mode. Instead of polling after IO submission, we can induce an artificial delay, and then poll after that. For example, if the IO is presumed to complete in 8 usecs from now, we can sleep for 4 usecs, wake up, and then do our polling. This still puts a sleep/wakeup cycle in the IO path, but instead of the wakeup happening after the IO has completed, it'll happen before. With this hybrid scheme, we can achieve big latency reductions while still using the same (or less) amount of CPU. 
Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- block/blk-mq.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk-sysfs.c | 29 +++++++++++++++++++++++++++++ block/blk.h | 1 + include/linux/blkdev.h | 1 + 4 files changed, 81 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq.c b/block/blk-mq.c index f39e69c732cc..8cb248fb6a68 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -332,6 +332,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->rq_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); + clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } @@ -2468,11 +2469,60 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct request *rq) +{ + struct hrtimer_sleeper hs; + enum hrtimer_mode mode; + ktime_t kt; + + if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); + + /* + * This will be replaced with the stats tracking code, using + * 'avg_completion_time / 2' as the pre-sleep target. 
+ */ + kt = ktime_set(0, q->poll_nsec); + + mode = HRTIMER_MODE_REL; + hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&hs.timer, kt); + + hrtimer_init_sleeper(&hs, current); + do { + if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_start_expires(&hs.timer, mode); + if (hs.task) + io_schedule(); + hrtimer_cancel(&hs.timer); + mode = HRTIMER_MODE_ABS; + } while (hs.task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&hs.timer); + return true; +} + static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct request_queue *q = hctx->queue; long state; + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. 
+ */ + if (blk_mq_poll_hybrid_sleep(q, rq)) + return true; + hctx->poll_considered++; state = current->state; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 415e764807d0..dcdfcaa12653 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -350,6 +350,28 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->poll_nsec / 1000, page); +} + +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_usec; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_usec, page, count); + if (ret < 0) + return ret; + + q->poll_nsec = poll_usec * 1000; + return ret; +} + static ssize_t queue_poll_show(struct request_queue *q, char *page) { return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); @@ -602,6 +624,12 @@ static struct queue_sysfs_entry queue_poll_entry = { .store = queue_poll_store, }; +static struct queue_sysfs_entry queue_poll_delay_entry = { + .attr = {.name = "io_poll_delay", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_delay_show, + .store = queue_poll_delay_store, +}; + static struct queue_sysfs_entry queue_wc_entry = { .attr = {.name = "write_cache", .mode = S_IRUGO | S_IWUSR }, .show = queue_wc_show, @@ -655,6 +683,7 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_stats_entry.attr, &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, NULL, }; diff --git a/block/blk.h b/block/blk.h index aa132dea598c..041185e5f129 100644 --- a/block/blk.h +++ b/block/blk.h @@ -111,6 +111,7 @@ void blk_account_io_done(struct request *req); enum rq_atomic_flags { REQ_ATOM_COMPLETE = 0, REQ_ATOM_STARTED, + REQ_ATOM_POLL_SLEPT, }; /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bab18ee5810d..37ed4ea705c8 100644 --- 
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,6 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; + unsigned int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit From 64f1c21e86f7fe63337b5c23c129de3ec506431d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 14 Nov 2016 13:03:03 -0700 Subject: blk-mq: make the polling code adaptive The previous commit introduced the hybrid sleep/poll mode. Take that one step further, and use the completion latencies to automatically sleep for half the mean completion time. This is a good approximation. This changes the 'io_poll_delay' sysfs file a bit to expose the various options. Depending on the value, the polling code will behave differently: -1 Never enter hybrid sleep mode 0 Use half of the completion mean for the sleep delay >0 Use this specific value as the sleep delay Signed-off-by: Jens Axboe Tested-By: Stephen Bates Reviewed-By: Stephen Bates --- block/blk-mq.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- block/blk-sysfs.c | 26 ++++++++++++++------ include/linux/blkdev.h | 2 +- 3 files changed, 83 insertions(+), 12 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8cb248fb6a68..9d4a1d630d0b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2132,6 +2132,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->nr_requests = set->queue_depth; + /* + * Default to classic polling + */ + q->poll_nsec = -1; + if (set->ops->complete) blk_queue_softirq_done(q, set->ops->complete); @@ -2469,14 +2474,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); +static unsigned long blk_mq_poll_nsecs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct blk_rq_stat stat[2]; + unsigned long ret = 0; + + /* + * If 
stats collection isn't on, don't sleep but turn it on for + * future users + */ + if (!blk_stat_enable(q)) + return 0; + + /* + * We don't have to do this once per IO, should optimize this + * to just use the current window of stats until it changes + */ + memset(&stat, 0, sizeof(stat)); + blk_hctx_stat_get(hctx, stat); + + /* + * As an optimistic guess, use half of the mean service time + * for this type of request. We can (and should) make this smarter. + * For instance, if the completion latencies are tight, we can + * get closer than just half the mean. This is especially + * important on devices where the completion latencies are longer + * than ~10 usec. + */ + if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples) + ret = (stat[BLK_STAT_READ].mean + 1) / 2; + else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples) + ret = (stat[BLK_STAT_WRITE].mean + 1) / 2; + + return ret; +} + static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, struct request *rq) { struct hrtimer_sleeper hs; enum hrtimer_mode mode; + unsigned int nsecs; ktime_t kt; - if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags)) + return false; + + /* + * poll_nsec can be: + * + * -1: don't ever hybrid sleep + * 0: use half of prev avg + * >0: use this specific value + */ + if (q->poll_nsec == -1) + return false; + else if (q->poll_nsec > 0) + nsecs = q->poll_nsec; + else + nsecs = blk_mq_poll_nsecs(q, hctx, rq); + + if (!nsecs) return false; set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); @@ -2485,7 +2546,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, * This will be replaced with the stats tracking code, using * 'avg_completion_time / 2' as the pre-sleep target. 
*/ - kt = ktime_set(0, q->poll_nsec); + kt = ktime_set(0, nsecs); mode = HRTIMER_MODE_REL; hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); @@ -2520,7 +2581,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) * the IO isn't complete, we'll get called again and will go * straight to the busy poll loop. */ - if (blk_mq_poll_hybrid_sleep(q, rq)) + if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) return true; hctx->poll_considered++; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index dcdfcaa12653..1855c6770045 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { - return queue_var_show(q->poll_nsec / 1000, page); + int val; + + if (q->poll_nsec == -1) + val = -1; + else + val = q->poll_nsec / 1000; + + return sprintf(page, "%d\n", val); } static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { - unsigned long poll_usec; - ssize_t ret; + int err, val; if (!q->mq_ops || !q->mq_ops->poll) return -EINVAL; - ret = queue_var_store(&poll_usec, page, count); - if (ret < 0) - return ret; + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; - q->poll_nsec = poll_usec * 1000; - return ret; + if (val == -1) + q->poll_nsec = -1; + else + q->poll_nsec = val * 1000; + + return count; } static ssize_t queue_poll_show(struct request_queue *q, char *page) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 37ed4ea705c8..85699bc90a51 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -509,7 +509,7 @@ struct request_queue { unsigned int request_fn_active; unsigned int rq_timeout; - unsigned int poll_nsec; + int poll_nsec; struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; -- cgit From 9a05e7541c39680d28ecf91892338e074738d5fd Mon Sep 17 00:00:00 2001 From: 
Tobias Klauser Date: Fri, 18 Nov 2016 15:16:06 +0100 Subject: block: Change extern inline to static inline With compilers which follow the C99 standard (like modern versions of gcc and clang), "extern inline" does the opposite thing from older versions of gcc (emits code for an externally linkable version of the inline function). "static inline" does the intended behavior in all cases instead. Description taken from commit 6d91857d4826 ("staging, rtl8192e, LLVMLinux: Change extern inline to static inline"). This also fixes the following GCC warning when building with CONFIG_PM disabled: ./include/linux/blkdev.h:1143:20: warning: no previous prototype for 'blk_set_runtime_active' [-Wmissing-prototypes] Fixes: d07ab6d11477 ("block: Add blk_set_runtime_active()") Reviewed-by: Mika Westerberg Signed-off-by: Tobias Klauser Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 85699bc90a51..541fdd8787a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1174,7 +1174,7 @@ static inline int blk_pre_runtime_suspend(struct request_queue *q) static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} static inline void blk_pre_runtime_resume(struct request_queue *q) {} static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} -extern inline void blk_set_runtime_active(struct request_queue *q) {} +static inline void blk_set_runtime_active(struct request_queue *q) {} #endif /* -- cgit From e73c23ff736e1ea371dfa419d7bf8e77ee53044a Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:58 -0800 Subject: block: add async variant of blkdev_issue_zeroout Similar to __blkdev_issue_discard this variant allows submitting the final bio asynchronously and chaining multiple ranges into a single completion. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-lib.c | 115 ++++++++++++++++++++++++++++++++++--------------- include/linux/blkdev.h | 3 ++ 2 files changed, 84 insertions(+), 34 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/blk-lib.c b/block/blk-lib.c index 18abda862915..bfb28b03765e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -137,24 +137,24 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL(blkdev_issue_discard); /** - * blkdev_issue_write_same - queue a write same operation + * __blkdev_issue_write_same - generate number of bios with same page * @bdev: target blockdev * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) * @page: page containing data to write + * @biop: pointer to anchor bio * * Description: - * Issue a write same request for the sectors in question. + * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. 
*/ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) +static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page, + struct bio **biop) { struct request_queue *q = bdev_get_queue(bdev); unsigned int max_write_same_sectors; - struct bio *bio = NULL; - int ret = 0; + struct bio *bio = *biop; sector_t bs_mask; if (!q) @@ -164,6 +164,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; + if (!bdev_write_same(bdev)) + return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ max_write_same_sectors = UINT_MAX >> 9; @@ -185,32 +188,63 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, bio->bi_iter.bi_size = nr_sects << 9; nr_sects = 0; } + cond_resched(); } - if (bio) { + *biop = bio; + return 0; +} + +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data + * + * Description: + * Issue a write same request for the sectors in question. 
+ */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, + &bio); + if (ret == 0 && bio) { ret = submit_bio_wait(bio); bio_put(bio); } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); /** - * blkdev_issue_zeroout - generate number of zero filed write bios + * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue * @sector: start sector * @nr_sects: number of sectors to write * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * @discard: discard flag * * Description: * Generate and issue number of bios with zerofiled pages. */ - -static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask) +int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard) { int ret; - struct bio *bio = NULL; + int bi_size = 0; + struct bio *bio = *biop; unsigned int sz; sector_t bs_mask; @@ -218,6 +252,19 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; + if (discard) { + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, + BLKDEV_DISCARD_ZERO, biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + } + + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, + ZERO_PAGE(0), biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + + ret = 0; while (nr_sects != 0) { bio = next_bio(bio, min(nr_sects, (sector_t)BIO_MAX_PAGES), gfp_mask); @@ -227,21 +274,20 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, while (nr_sects != 0) { sz = min((sector_t) 
PAGE_SIZE >> 9 , nr_sects); - ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); - nr_sects -= ret >> 9; - sector += ret >> 9; - if (ret < (sz << 9)) + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < (sz << 9)) break; } + cond_resched(); } - if (bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; - } - return 0; + *biop = bio; +out: + return ret; } +EXPORT_SYMBOL(__blkdev_issue_zeroout); /** * blkdev_issue_zeroout - zero-fill a block range @@ -263,21 +309,22 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. */ - int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard) { - if (discard) { - if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, - BLKDEV_DISCARD_ZERO)) - return 0; - } + int ret; + struct bio *bio = NULL; + struct blk_plug plug; - if (bdev_write_same(bdev) && - blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, - ZERO_PAGE(0)) == 0) - return 0; + blk_start_plug(&plug); + ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, + &bio, discard); + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); - return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask); + return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 541fdd8787a5..7e9d8a0895be 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1269,6 +1269,9 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); +extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t 
nr_sects, gfp_t gfp_mask, struct bio **biop, + bool discard); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, bool discard); static inline int sb_issue_discard(struct super_block *sb, sector_t block, -- cgit From a6f0788ec2881ac14e97ff7fa6a78a807f87b5ba Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 30 Nov 2016 12:28:59 -0800 Subject: block: add support for REQ_OP_WRITE_ZEROES This adds a new block layer operation to zero out a range of LBAs. This allows implementing zeroing for devices that don't use either discard with a predictable zero pattern or WRITE SAME of zeroes. The prominent example of that is NVMe with the Write Zeroes command, but in the future, this should also help with improving the way zeroing discards work. For this operation, a suitable entry is exported in sysfs which indicates the maximum number of bytes allowed in one write zeroes operation by the device. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 13 ++++++++ block/bio.c | 1 + block/blk-core.c | 4 +++ block/blk-lib.c | 58 +++++++++++++++++++++++++++++++++-- block/blk-merge.c | 17 +++++++--- block/blk-settings.c | 17 ++++++++++ block/blk-sysfs.c | 11 +++++++ block/blk-wbt.c | 5 +-- include/linux/bio.h | 25 ++++++++------- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 19 ++++++++++++ 11 files changed, 153 insertions(+), 19 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ee2d5cd26bfe..2da04ce6aeef 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -235,6 +235,19 @@ Description: write_same_max_bytes is 0, write same is not supported by the device.
+What: /sys/block//queue/write_zeroes_max_bytes +Date: November 2016 +Contact: Chaitanya Kulkarni +Description: + Devices that support write zeroes operation in which a + single request can be issued to zero out the range of + contiguous blocks on storage without having any payload + in the request. This can be used to optimize writing zeroes + to the devices. write_zeroes_max_bytes indicates how many + bytes can be written in a single write zeroes command. If + write_zeroes_max_bytes is 0, write zeroes is not supported + by the device. + What: /sys/block//queue/zoned Date: September 2016 Contact: Damien Le Moal diff --git a/block/bio.c b/block/bio.c index de257ced69b1..83db1f37fd0b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: break; case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; diff --git a/block/blk-core.c b/block/blk-core.c index 6c4a425690fc..3f2eb8d80189 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; + case REQ_OP_WRITE_ZEROES: + if (!bdev_write_zeroes_sectors(bio->bi_bdev)) + goto not_supported; + break; default: break; } diff --git a/block/blk-lib.c b/block/blk-lib.c index bfb28b03765e..510a6fb15318 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_write_same); +/** + * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * + * Description: + * Generate and issue number of 
bios(REQ_OP_WRITE_ZEROES) with zerofiled pages. + */ +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); + + if (!q) + return -ENXIO; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0); + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + /** * __blkdev_issue_zeroout - generate number of zero filed write bios * @bdev: blockdev to issue @@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, goto out; } + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop); + if (ret == 0 || (ret && ret != -EOPNOTSUPP)) + goto out; + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, ZERO_PAGE(0), biop); if (ret == 0 || (ret && ret != -EOPNOTSUPP)) @@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout); * the discard request fail, if the discard flag is not set, or if * discard_zeroes_data is not supported, this function will resort to * zeroing the blocks manually, thus provisioning (allocating, - * anchoring) them. If the block device supports the WRITE SAME command - * blkdev_issue_zeroout() will use it to optimize the process of + * anchoring) them. 
If the block device supports WRITE ZEROES or WRITE SAME + * command(s), blkdev_issue_zeroout() will use it to optimize the process of * clearing the block range. Otherwise the zeroing will be performed * using regular WRITE calls. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index fda6a12fc776..cf2848cb91d8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); break; + case REQ_OP_WRITE_ZEROES: + split = NULL; + nsegs = (*bio)->bi_phys_segments; + break; case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); break; @@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * This should probably be returning 0, but blk_add_request_payload() * (Christoph!!!!) */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } fbio = bio; cluster = blk_queue_cluster(q); @@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: /* * This is a hack - drivers should be neither modifying the * biovec, nor relying on bi_vcnt - but because of diff --git a/block/blk-settings.c b/block/blk-settings.c index c7ccabc0ec3e..8a2bc124a684 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; @@ -132,6 +133,7 @@ void 
blk_set_stacking_limits(struct queue_limits *lim) lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -299,6 +301,19 @@ void blk_queue_max_write_same_sectors(struct request_queue *q, } EXPORT_SYMBOL(blk_queue_max_write_same_sectors); +/** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + /** * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device @@ -527,6 +542,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a97841491769..706b27bd73a1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -211,6 +211,11 @@ static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_same_sectors << 9); } +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) @@ -611,6 
+616,11 @@ static struct queue_sysfs_entry queue_write_same_max_entry = { .show = queue_write_same_max_show, }; +static struct queue_sysfs_entry queue_write_zeroes_max_entry = { + .attr = {.name = "write_zeroes_max_bytes", .mode = S_IRUGO }, + .show = queue_write_zeroes_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_show_nonrot, @@ -700,6 +710,7 @@ static struct attribute *default_attrs[] = { &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nomerges_entry.attr, diff --git a/block/blk-wbt.c b/block/blk-wbt.c index b8647343141f..d500e43da5d9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -575,9 +575,10 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) const int op = bio_op(bio); /* - * If not a WRITE (or a discard), do nothing + * If not a WRITE (or a discard or write zeroes), do nothing */ - if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD)) + if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_OP_WRITE_ZEROES)) return false; /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 70a7244f08a7..b15323934a29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -76,7 +76,8 @@ static inline bool bio_has_data(struct bio *bio) if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && - bio_op(bio) != REQ_OP_SECURE_ERASE) + bio_op(bio) != REQ_OP_SECURE_ERASE && + bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; @@ -86,7 +87,8 @@ static inline bool bio_no_advance_iter(struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || - bio_op(bio) == REQ_OP_WRITE_SAME; + bio_op(bio) == REQ_OP_WRITE_SAME || + bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline bool bio_mergeable(struct bio *bio) @@ -188,18 
+190,19 @@ static inline unsigned bio_segments(struct bio *bio) struct bvec_iter iter; /* - * We special case discard/write same, because they interpret bi_size - * differently: + * We special case discard/write same/write zeroes, because they + * interpret bi_size differently: */ - if (bio_op(bio) == REQ_OP_DISCARD) - return 1; - - if (bio_op(bio) == REQ_OP_SECURE_ERASE) - return 1; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE_ZEROES: return 1; + default: + break; + } bio_for_each_segment(bv, bio, iter) segs++; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f57458a6a93b..519ea2c9df61 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -159,6 +159,8 @@ enum req_opf { REQ_OP_ZONE_RESET = 6, /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, + /* write the zero filled sector many times */ + REQ_OP_WRITE_ZEROES = 8, REQ_OP_LAST, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7e9d8a0895be..ebeef2b79c5a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -323,6 +323,7 @@ struct queue_limits { unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; + unsigned int max_write_zeroes_sectors; unsigned int discard_granularity; unsigned int discard_alignment; @@ -774,6 +775,9 @@ static inline bool rq_mergeable(struct request *rq) if (req_op(rq) == REQ_OP_FLUSH) return false; + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) @@ -1004,6 +1008,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + return 
q->limits.max_sectors; } @@ -1107,6 +1114,8 @@ extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); +extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, @@ -1475,6 +1484,16 @@ static inline unsigned int bdev_write_same(struct block_device *bdev) return 0; } +static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return q->limits.max_write_zeroes_sectors; + + return 0; +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit From f9d03f96b988002027d4b28ea1b7a24729a4c9b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Dec 2016 15:20:32 -0700 Subject: block: improve handling of the magic discard payload Instead of allocating a single unused biovec for discard requests, send them down without any payload. Instead we allow the driver to add a "special" payload using a biovec embedded into struct request (unioned over other fields never used while in the driver), and overloading the number of segments for this case. This has a couple of advantages: - we don't have to allocate the bio_vec - the amount of special casing for discard requests in the block layer is significantly reduced - using this same scheme for other request types is trivial, which will be important for implementing the new WRITE_ZEROES op on devices where it actually requires a payload (e.g. 
SCSI) - we can get rid of playing games with the request length, as we'll never touch it and completions will work just fine - it will allow us to support ranged discard operations in the future by merging non-contiguous discard bios into a single request - last but not least it removes a lot of code This patch is the common base for my WIP series for ranged discards and to remove discard_zeroes_data in favor of always using REQ_OP_WRITE_ZEROES, so it would be good to get it in quickly. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 10 +-------- block/blk-core.c | 34 ++--------------------------- block/blk-lib.c | 2 +- block/blk-merge.c | 53 +++++++++++++++------------------------------- drivers/nvme/host/core.c | 17 ++++----------- drivers/nvme/host/nvme.h | 6 ++++-- drivers/nvme/host/pci.c | 27 +++++++++++------------ drivers/nvme/host/rdma.c | 13 +++++------- drivers/nvme/target/loop.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- drivers/scsi/sd.c | 24 ++++++++------------- include/linux/bio.h | 3 ++- include/linux/blkdev.h | 15 ++++++++++--- 13 files changed, 76 insertions(+), 138 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/block/bio.c b/block/bio.c index 83db1f37fd0b..2b375020fc49 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1840,15 +1840,7 @@ struct bio *bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); BUG_ON(sectors >= bio_sectors(bio)); - /* - * Discards need a mutable bio_vec to accommodate the payload - * required by the DSM TRIM and UNMAP commands.
- */ - if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) - split = bio_clone_bioset(bio, gfp, bs); - else - split = bio_clone_fast(bio, gfp, bs); - + split = bio_clone_fast(bio, gfp, bs); if (!split) return NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 4b7ec5958055..bd642a43b98b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1475,38 +1475,6 @@ void blk_put_request(struct request *req) } EXPORT_SYMBOL(blk_put_request); -/** - * blk_add_request_payload - add a payload to a request - * @rq: request to update - * @page: page backing the payload - * @offset: offset in page - * @len: length of the payload. - * - * This allows to later add a payload to an already submitted request by - * a block driver. The driver needs to take care of freeing the payload - * itself. - * - * Note that this is a quite horrible hack and nothing but handling of - * discard requests should ever use it. - */ -void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len) -{ - struct bio *bio = rq->bio; - - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = offset; - bio->bi_io_vec->bv_len = len; - - bio->bi_iter.bi_size = len; - bio->bi_vcnt = 1; - bio->bi_phys_segments = 1; - - rq->__data_len = rq->resid_len = len; - rq->nr_phys_segments = 1; -} -EXPORT_SYMBOL_GPL(blk_add_request_payload); - bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio) { @@ -2642,6 +2610,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) return false; } + WARN_ON_ONCE(req->rq_flags & RQF_SPECIAL_PAYLOAD); + req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ diff --git a/block/blk-lib.c b/block/blk-lib.c index 510a6fb15318..ed89c8f4b2a0 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -80,7 +80,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, req_sects = end_sect - sector; 
} - bio = next_bio(bio, 1, gfp_mask); + bio = next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); diff --git a/block/blk-merge.c b/block/blk-merge.c index 1002afdfee99..182398cb1524 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -241,18 +241,13 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (!bio) return 0; - /* - * This should probably be returning 0, but blk_add_request_payload() - * (Christoph!!!!) - */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; - default: - break; } fbio = bio; @@ -410,39 +405,21 @@ new_segment: *bvprv = *bvec; } +static inline int __blk_bvec_map_sg(struct request_queue *q, struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = sglist; + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; - int nsegs, cluster; - - nsegs = 0; - cluster = blk_queue_cluster(q); - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_ZEROES: - /* - * This is a hack - drivers should be neither modifying the - * biovec, nor relying on bi_vcnt - but because of - * blk_add_request_payload(), a discard bio may or may not have - * a payload we need to set up here (thank you Christoph) and - * bi_vcnt is really the only way of telling if we need to. 
- */ - if (!bio->bi_vcnt) - return 0; - /* Fall through */ - case REQ_OP_WRITE_SAME: - *sg = sglist; - bvec = bio_iovec(bio); - sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); - return 1; - default: - break; - } + int cluster = blk_queue_cluster(q), nsegs = 0; for_each_bio(bio) bio_for_each_segment(bvec, bio, iter) @@ -462,7 +439,11 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg = NULL; int nsegs = 0; - if (rq->bio) + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(q, rq->special_vec, sglist, &sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(q, bio_iovec(rq->bio), sglist, &sg); + else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); if (unlikely(rq->rq_flags & RQF_COPY_USER) && @@ -495,7 +476,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, * Something must have been wrong if the figured number of * segment is bigger than number of req's physical segments */ - WARN_ON(nsegs > rq->nr_phys_segments); + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); return nsegs; } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1b48514fbe99..3b1d6478dcfb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -239,8 +239,6 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd) { struct nvme_dsm_range *range; - struct page *page; - int offset; unsigned int nr_bytes = blk_rq_bytes(req); range = kmalloc(sizeof(*range), GFP_ATOMIC); @@ -257,17 +255,10 @@ static inline int nvme_setup_discard(struct nvme_ns *ns, struct request *req, cmnd->dsm.nr = 0; cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - req->completion_data = range; - page = virt_to_page(range); - offset = offset_in_page(range); - blk_add_request_payload(req, page, offset, sizeof(*range)); - - /* - * we set __data_len back to the size of the area to be discarded - * on disk. 
This allows us to report completion on the full amount - * of blocks described by the request. - */ - req->__data_len = nr_bytes; + req->special_vec.bv_page = virt_to_page(range); + req->special_vec.bv_offset = offset_in_page(range); + req->special_vec.bv_len = sizeof(*range); + req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_MQ_RQ_QUEUE_OK; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a3d6ffd874af..bd5321441d12 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -236,8 +236,10 @@ static inline unsigned nvme_map_len(struct request *rq) static inline void nvme_cleanup_cmd(struct request *req) { - if (req_op(req) == REQ_OP_DISCARD) - kfree(req->completion_data); + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + kfree(page_address(req->special_vec.bv_page) + + req->special_vec.bv_offset); + } } static inline int nvme_error_status(u16 status) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 82b9b3f1f21d..717d6ea47ee4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -302,14 +302,14 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, static __le64 **iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + req->nr_phys_segments); + return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } static int nvme_init_iod(struct request *rq, unsigned size, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); - int nseg = rq->nr_phys_segments; + int nseg = blk_rq_nr_phys_segments(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -339,8 +339,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; - nvme_cleanup_cmd(req); - if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -510,7 +508,7 
@@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_TO_DEVICE : DMA_FROM_DEVICE; int ret = BLK_MQ_RQ_QUEUE_ERROR; - sg_init_table(iod->sg, req->nr_phys_segments); + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; @@ -566,6 +564,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) } } + nvme_cleanup_cmd(req); nvme_free_iod(dev, req); } @@ -596,20 +595,20 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, } } - map_len = nvme_map_len(req); - ret = nvme_init_iod(req, map_len, dev); + ret = nvme_setup_cmd(ns, req, &cmnd); if (ret != BLK_MQ_RQ_QUEUE_OK) return ret; - ret = nvme_setup_cmd(ns, req, &cmnd); + map_len = nvme_map_len(req); + ret = nvme_init_iod(req, map_len, dev); if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out; + goto out_free_cmd; - if (req->nr_phys_segments) + if (blk_rq_nr_phys_segments(req)) ret = nvme_map_data(dev, req, map_len, &cmnd); if (ret != BLK_MQ_RQ_QUEUE_OK) - goto out; + goto out_cleanup_iod; blk_mq_start_request(req); @@ -620,14 +619,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, else ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); - goto out; + goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; -out: +out_cleanup_iod: nvme_free_iod(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); return ret; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b037d0cb2a7e..251101bf982f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -952,8 +952,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_device *dev = queue->device; struct ib_device *ibdev = dev->dev; - int nents, count; - int ret; + int count, ret; req->num_sge = 1; req->inline_data = false; @@ -965,16 +964,14 @@ static int 
nvme_rdma_map_data(struct nvme_rdma_queue *queue, return nvme_rdma_set_sg_null(c); req->sg_table.sgl = req->first_sgl; - ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments, - req->sg_table.sgl); + ret = sg_alloc_table_chained(&req->sg_table, + blk_rq_nr_phys_segments(rq), req->sg_table.sgl); if (ret) return -ENOMEM; - nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); - BUG_ON(nents > rq->nr_phys_segments); - req->nents = nents; + req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); - count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents, + count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (unlikely(count <= 0)) { sg_free_table_chained(&req->sg_table, true); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 57ded6b3ed8a..9aaa70071ae5 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -185,13 +185,13 @@ static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (blk_rq_bytes(req)) { iod->sg_table.sgl = iod->first_sgl; ret = sg_alloc_table_chained(&iod->sg_table, - req->nr_phys_segments, iod->sg_table.sgl); + blk_rq_nr_phys_segments(req), + iod->sg_table.sgl); if (ret) return BLK_MQ_RQ_QUEUE_BUSY; iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); - BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); } blk_mq_start_request(req); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 47a5c8783b89..9a8ccff1121f 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1007,8 +1007,8 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb) /* * If sg table allocation fails, requeue request later. 
*/ - if (unlikely(sg_alloc_table_chained(&sdb->table, req->nr_phys_segments, - sdb->table.sgl))) + if (unlikely(sg_alloc_table_chained(&sdb->table, + blk_rq_nr_phys_segments(req), sdb->table.sgl))) return BLKPREP_DEFER; /* @@ -1040,7 +1040,7 @@ int scsi_init_io(struct scsi_cmnd *cmd) bool is_mq = (rq->mq_ctx != NULL); int error; - BUG_ON(!rq->nr_phys_segments); + BUG_ON(!blk_rq_nr_phys_segments(rq)); error = scsi_init_sgtable(rq, &cmd->sdb); if (error) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65738b0aad36..079c2d9759fb 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -716,7 +716,6 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd) struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); - unsigned int nr_bytes = blk_rq_bytes(rq); unsigned int len; int ret; char *buf; @@ -772,24 +771,19 @@ static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd) goto out; } - rq->completion_data = page; rq->timeout = SD_TIMEOUT; cmd->transfersize = len; cmd->allowed = SD_MAX_RETRIES; - /* - * Initially __data_len is set to the amount of data that needs to be - * transferred to the target. This amount depends on whether WRITE SAME - * or UNMAP is being used. After the scatterlist has been mapped by - * scsi_init_io() we set __data_len to the size of the area to be - * discarded on disk. This allows us to report completion on the full - * amount of blocks described by the request. 
- */ - blk_add_request_payload(rq, page, 0, len); - ret = scsi_init_io(cmd); - rq->__data_len = nr_bytes; + rq->special_vec.bv_page = page; + rq->special_vec.bv_offset = 0; + rq->special_vec.bv_len = len; + + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->resid_len = len; + ret = scsi_init_io(cmd); out: if (ret != BLKPREP_OK) __free_page(page); @@ -1182,8 +1176,8 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; - if (req_op(rq) == REQ_OP_DISCARD) - __free_page(rq->completion_data); + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + __free_page(rq->special_vec.bv_page); if (SCpnt->cmnd != rq->cmd) { mempool_free(SCpnt->cmnd, sd_cdb_pool); diff --git a/include/linux/bio.h b/include/linux/bio.h index b15323934a29..7cf8a6c70a3f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -197,8 +197,9 @@ static inline unsigned bio_segments(struct bio *bio) switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - case REQ_OP_WRITE_SAME: case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: return 1; default: break; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ebeef2b79c5a..c5393766909d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -120,10 +120,13 @@ typedef __u32 __bitwise req_flags_t; #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* IO stats tracking on */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) +/* Look at ->special_vec for the actual data payload instead of the + bio chain. 
*/ +#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ) + (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) #define BLK_MAX_CDB 16 @@ -175,6 +178,7 @@ struct request { */ union { struct rb_node rb_node; /* sort/lookup */ + struct bio_vec special_vec; void *completion_data; }; @@ -909,8 +913,6 @@ extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); extern void blk_rq_set_block_pc(struct request *); extern void blk_requeue_request(struct request_queue *, struct request *); -extern void blk_add_request_payload(struct request *rq, struct page *page, - int offset, unsigned int len); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, @@ -1153,6 +1155,13 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return 1; + return rq->nr_phys_segments; +} + extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); extern long nr_blockdev_pages(void); -- cgit From e8465447d2f3366069115f7453153561ac9a1220 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 16 Dec 2016 10:11:56 +0530 Subject: block: Remove unused member (busy) from struct blk_queue_tag Signed-off-by: Ritesh Harjani Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 
'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 286b2a264383..83695641bd5e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -288,7 +288,6 @@ enum blk_queue_state { struct blk_queue_tag { struct request **tag_index; /* map of busy tags */ unsigned long *tag_map; /* bit map of free/busy tags */ - int busy; /* current depth */ int max_depth; /* what we will send to device */ int real_max_depth; /* what the array can hold */ atomic_t refcnt; /* map can be shared */ -- cgit