Diffstat (limited to 'drivers/block')
30 files changed, 1056 insertions, 900 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f79f20430ef7..5b9d4aaebb81 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -385,6 +385,23 @@ config BLK_DEV_UBLK
 	  can handle batch more effectively, but task_work_add() isn't exported
 	  for module, so ublk has to be built to kernel.
 
+config BLKDEV_UBLK_LEGACY_OPCODES
+	bool "Support legacy command opcode"
+	depends on BLK_DEV_UBLK
+	default y
+	help
+	  The ublk driver started out with a plain command opcode encoding,
+	  which turned out to be a bad choice. The traditional ioctl command
+	  opcode encodes more information and defines each code uniquely, so
+	  opcode conflicts are avoided, the driver can reject wrong commands
+	  easily, and the encoding may help the security subsystem audit
+	  io_uring commands.
+
+	  Say Y if your application still uses the legacy command opcodes.
+
+	  Say N if you don't need legacy command opcode support. Saying N is
+	  suggested once your application (the ublk server) has switched to
+	  the ioctl command encoding.
+
 source "drivers/block/rnbd/Kconfig"
 
 endif # BLK_DEV
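For reference, the difference between the two encodings looks roughly like this (a sketch modeled on include/uapi/linux/ublk_cmd.h; treat the exact value and argument struct as assumptions, not a quote of the header):

#include <linux/ioctl.h>

/* Legacy encoding: a bare integer. Nothing marks it as a ublk opcode,
 * so it can collide with another io_uring command that also uses 0x02. */
#define UBLK_CMD_GET_DEV_INFO	0x02

/* ioctl-style encoding: _IOR() folds the transfer direction, the 'u'
 * magic, the command number and the argument size into one unique
 * 32-bit opcode, so wrong commands are cheap to detect and audit. */
#define UBLK_U_CMD_GET_DEV_INFO \
	_IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd)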
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 7a368c90467d..4c666f72203f 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -290,7 +290,7 @@ aoechr_init(void)
 	}
 	init_completion(&emsgs_comp);
 	spin_lock_init(&emsgs_lock);
-	aoe_class = class_create(THIS_MODULE, "aoe");
+	aoe_class = class_create("aoe");
 	if (IS_ERR(aoe_class)) {
 		unregister_chrdev(AOE_MAJOR, "aoechr");
 		return PTR_ERR(aoe_class);
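The aoechr.c hunk follows the driver-core change in this cycle that dropped the owning-module argument from class_create(); only the class name is passed now. A minimal sketch of the updated calling convention (module and class names are made up for illustration):

#include <linux/device/class.h>
#include <linux/err.h>
#include <linux/module.h>

static struct class *example_class;

static int __init example_init(void)
{
	example_class = class_create("example");	/* no more THIS_MODULE */
	if (IS_ERR(example_class))
		return PTR_ERR(example_class);
	return 0;
}
module_init(example_init);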
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 34177f1bd97d..bcad9b926b0c 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -404,7 +404,6 @@ static int brd_alloc(int i)
 	/* Tell the block layer that this is not a rotational device */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
 	err = add_disk(disk);
 	if (err)
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 429255876800..64b3a1c76f03 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -735,8 +735,9 @@ static bool update_rs_extent(struct drbd_device *device,
 	return false;
 }
 
-void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go)
 {
+	struct drbd_device *device = peer_device->device;
 	unsigned long now = jiffies;
 	unsigned long last = device->rs_mark_time[device->rs_last_mark];
 	int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
@@ -819,7 +820,7 @@ static int update_sync_bits(struct drbd_device *device,
 		if (mode == SET_IN_SYNC) {
 			unsigned long still_to_go = drbd_bm_total_weight(device);
 			bool rs_is_done = (still_to_go <= device->rs_failed);
-			drbd_advance_rs_marks(device, still_to_go);
+			drbd_advance_rs_marks(first_peer_device(device), still_to_go);
 			if (cleared || rs_is_done)
 				maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
 		} else if (mode == RECORD_RS_FAILED)
@@ -843,10 +844,11 @@ static bool plausible_request_size(int size)
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
-int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size,
 		enum update_sync_bits_mode mode)
 {
 	/* Is called from worker and receiver context _only_ */
+	struct drbd_device *device = peer_device->device;
 	unsigned long sbnr, ebnr, lbnr;
 	unsigned long count = 0;
 	sector_t esector, nr_sectors;
@@ -1009,14 +1011,15 @@ retry:
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
-int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
+int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector)
 {
+	struct drbd_device *device = peer_device->device;
 	unsigned int enr = BM_SECT_TO_EXT(sector);
 	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
 	struct lc_element *e;
 	struct bm_extent *bm_ext;
 	int i;
-	bool throttle = drbd_rs_should_slow_down(device, sector, true);
+	bool throttle = drbd_rs_should_slow_down(peer_device, sector, true);
 
 	/* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
 	 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
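The drbd_actlog.c hunks above show the conversion that repeats through the rest of this series: functions that used to take a struct drbd_device * (and had to guess the peer via first_peer_device()) now receive the struct drbd_peer_device * explicitly and derive the device from it. Reduced to its skeleton (hypothetical function name, fields as in drbd_int.h):

/* before: only the device is known, the peer is guessed */
void example_op(struct drbd_device *device)
{
	struct drbd_peer_device *peer_device = first_peer_device(device);
	/* ... */
}

/* after: the caller states which peer it means */
void example_op(struct drbd_peer_device *peer_device)
{
	struct drbd_device *device = peer_device->device;
	/* ... */
}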
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 289876ffbc31..6ac8c54b44c7 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1216,7 +1216,9 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
 * drbd_bm_read() - Read the whole bitmap from its on disk location.
 * @device:	DRBD device.
 */
-int drbd_bm_read(struct drbd_device *device) __must_hold(local)
+int drbd_bm_read(struct drbd_device *device,
+		 struct drbd_peer_device *peer_device) __must_hold(local)
+
 {
 	return bm_rw(device, BM_AIO_READ, 0);
 }
@@ -1227,7 +1229,8 @@ int drbd_bm_read(struct drbd_device *device) __must_hold(local)
 *
 * Will only write pages that have changed since last IO.
 */
-int drbd_bm_write(struct drbd_device *device) __must_hold(local)
+int drbd_bm_write(struct drbd_device *device,
+		  struct drbd_peer_device *peer_device) __must_hold(local)
 {
 	return bm_rw(device, 0, 0);
 }
@@ -1238,7 +1241,8 @@ int drbd_bm_write(struct drbd_device *device) __must_hold(local)
 *
 * Will write all pages.
 */
-int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
+int drbd_bm_write_all(struct drbd_device *device,
+		      struct drbd_peer_device *peer_device) __must_hold(local)
 {
 	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
 }
@@ -1264,7 +1268,8 @@ int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_ho
 * verify is aborted due to a failed peer disk, while local IO continues, or
 * pending resync acks are still being processed.
 */
-int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+int drbd_bm_write_copy_pages(struct drbd_device *device,
+			     struct drbd_peer_device *peer_device) __must_hold(local)
 {
 	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
 }
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index d89b7d03d4c8..a30a5ed811be 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -66,6 +66,7 @@ extern int drbd_proc_details;
 
 struct drbd_device;
 struct drbd_connection;
+struct drbd_peer_device;
 
 /* Defines to control fault insertion */
 enum {
@@ -126,8 +127,8 @@ struct bm_xfer_ctx {
 	unsigned bytes[2];
 };
 
-extern void INFO_bm_xfer_stats(struct drbd_device *device,
-		const char *direction, struct bm_xfer_ctx *c);
+extern void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
+			       const char *direction, struct bm_xfer_ctx *c);
 
 static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
 {
@@ -541,9 +542,10 @@ struct drbd_md_io {
 
 struct bm_io_work {
 	struct drbd_work w;
+	struct drbd_peer_device *peer_device;
 	char *why;
 	enum bm_flag flags;
-	int  (*io_fn)(struct drbd_device *device);
+	int  (*io_fn)(struct drbd_device *device, struct drbd_peer_device *peer_device);
 	void (*done)(struct drbd_device *device, int rv);
 };
 
@@ -1041,7 +1043,7 @@ extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
 				   enum drbd_packet cmd);
 extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
 
-extern int drbd_send_bitmap(struct drbd_device *device);
+extern int drbd_send_bitmap(struct drbd_device *device, struct drbd_peer_device *peer_device);
 extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
 extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
 extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
@@ -1065,17 +1067,22 @@ extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold
 extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
 extern void drbd_md_mark_dirty(struct drbd_device *device);
 extern void drbd_queue_bitmap_io(struct drbd_device *device,
-				 int (*io_fn)(struct drbd_device *),
+				 int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
 				 void (*done)(struct drbd_device *, int),
-				 char *why, enum bm_flag flags);
+				 char *why, enum bm_flag flags,
+				 struct drbd_peer_device *peer_device);
 extern int drbd_bitmap_io(struct drbd_device *device,
-		int (*io_fn)(struct drbd_device *),
-		char *why, enum bm_flag flags);
+		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+		char *why, enum bm_flag flags,
+		struct drbd_peer_device *peer_device);
 extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
-		int (*io_fn)(struct drbd_device *),
-		char *why, enum bm_flag flags);
-extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+		char *why, enum bm_flag flags,
+		struct drbd_peer_device *peer_device);
+extern int drbd_bmio_set_n_write(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
 
 /* Meta data layout
 *
@@ -1284,14 +1291,18 @@ extern void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long
 	s, const unsigned long e);
 extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
 extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_read(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
 extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
-extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
 extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
 extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
-extern int  drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
-extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_all(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_device *device,
+		struct drbd_peer_device *peer_device) __must_hold(local);
 extern size_t	     drbd_bm_words(struct drbd_device *device);
 extern unsigned long drbd_bm_bits(struct drbd_device *device);
 extern sector_t      drbd_bm_capacity(struct drbd_device *device);
@@ -1422,21 +1433,24 @@ void drbd_resync_after_changed(struct drbd_device *device);
 extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
 extern void resume_next_sg(struct drbd_device *device);
 extern void suspend_other_sg(struct drbd_device *device);
-extern int drbd_resync_finished(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_peer_device *peer_device);
 
 /* maybe rather drbd_main.c ? */
 extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
 extern void drbd_md_put_buffer(struct drbd_device *device);
 extern int drbd_md_sync_page_io(struct drbd_device *device,
 		struct drbd_backing_dev *bdev, sector_t sector, enum req_op op);
-extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device,
+		sector_t sector, int size);
 extern void wait_until_done_or_force_detached(struct drbd_device *device,
 		struct drbd_backing_dev *bdev, unsigned int *done);
-extern void drbd_rs_controller_reset(struct drbd_device *device);
+extern void drbd_rs_controller_reset(struct drbd_peer_device *peer_device);
 
-static inline void ov_out_of_sync_print(struct drbd_device *device)
+static inline void ov_out_of_sync_print(struct drbd_peer_device *peer_device)
 {
+	struct drbd_device *device = peer_device->device;
+
 	if (device->ov_last_oos_size) {
-		drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		drbd_err(peer_device, "Out of sync: start=%llu, size=%lu (sectors)\n",
 		     (unsigned long long)device->ov_last_oos_start,
 		     (unsigned long)device->ov_last_oos_size);
 	}
@@ -1475,7 +1489,7 @@ extern int drbd_ack_receiver(struct drbd_thread *thi);
 extern void drbd_send_ping_wf(struct work_struct *ws);
 extern void drbd_send_acks_wf(struct work_struct *ws);
 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
 		bool throttle_if_app_is_waiting);
 extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
 extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
@@ -1531,22 +1545,22 @@ extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i
 extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
 extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
 extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
-extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector);
 extern void drbd_rs_cancel_all(struct drbd_device *device);
 extern int drbd_rs_del_all(struct drbd_device *device);
-extern void drbd_rs_failed_io(struct drbd_device *device,
+extern void drbd_rs_failed_io(struct drbd_peer_device *peer_device,
 		sector_t sector, int size);
-extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+extern void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go);
 
 enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
-extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+extern int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size,
 		enum update_sync_bits_mode mode);
-#define drbd_set_in_sync(device, sector, size) \
-	__drbd_change_sync(device, sector, size, SET_IN_SYNC)
-#define drbd_set_out_of_sync(device, sector, size) \
-	__drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC)
-#define drbd_rs_failed_io(device, sector, size) \
-	__drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
+#define drbd_set_in_sync(peer_device, sector, size) \
+	__drbd_change_sync(peer_device, sector, size, SET_IN_SYNC)
+#define drbd_set_out_of_sync(peer_device, sector, size) \
+	__drbd_change_sync(peer_device, sector, size, SET_OUT_OF_SYNC)
+#define drbd_rs_failed_io(peer_device, sector, size) \
+	__drbd_change_sync(peer_device, sector, size, RECORD_RS_FAILED)
 extern void drbd_al_shrink(struct drbd_device *device);
 extern int drbd_al_initialize(struct drbd_device *, void *);
 
@@ -1918,18 +1932,14 @@ static inline void inc_ap_pending(struct drbd_device *device)
 	atomic_inc(&device->ap_pending_cnt);
 }
 
-#define ERR_IF_CNT_IS_NEGATIVE(which, func, line)			\
-	if (atomic_read(&device->which) < 0)				\
-		drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n",	\
-			func, line,					\
-			atomic_read(&device->which))
-
-#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
-static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+#define dec_ap_pending(device) ((void)expect((device), __dec_ap_pending(device) >= 0))
+static inline int __dec_ap_pending(struct drbd_device *device)
 {
-	if (atomic_dec_and_test(&device->ap_pending_cnt))
+	int ap_pending_cnt = atomic_dec_return(&device->ap_pending_cnt);
+
+	if (ap_pending_cnt == 0)
 		wake_up(&device->misc_wait);
-	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+	return ap_pending_cnt;
 }
 
 /* counts how many resync-related answers we still expect from the peer
 *		     increase			decrease
 * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
 * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK with ID_SYNCER)
 *					   (or P_NEG_ACK with ID_SYNCER)
 */
-static inline void inc_rs_pending(struct drbd_device *device)
+static inline void inc_rs_pending(struct drbd_peer_device *peer_device)
 {
-	atomic_inc(&device->rs_pending_cnt);
+	atomic_inc(&peer_device->device->rs_pending_cnt);
 }
 
-#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
-static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+#define dec_rs_pending(peer_device) \
+	((void)expect((peer_device), __dec_rs_pending(peer_device) >= 0))
+static inline int __dec_rs_pending(struct drbd_peer_device *peer_device)
 {
-	atomic_dec(&device->rs_pending_cnt);
-	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+	return atomic_dec_return(&peer_device->device->rs_pending_cnt);
 }
 
 /* counts how many answers we still need to send to the peer.
@@ -1964,18 +1974,16 @@ static inline void inc_unacked(struct drbd_device *device)
 	atomic_inc(&device->unacked_cnt);
 }
 
-#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
-static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
+#define dec_unacked(device) ((void)expect(device, __dec_unacked(device) >= 0))
+static inline int __dec_unacked(struct drbd_device *device)
 {
-	atomic_dec(&device->unacked_cnt);
-	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+	return atomic_dec_return(&device->unacked_cnt);
 }
 
-#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
-static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
+#define sub_unacked(device, n) ((void)expect(device, __sub_unacked(device, n) >= 0))
+static inline int __sub_unacked(struct drbd_device *device, int n)
 {
-	atomic_sub(n, &device->unacked_cnt);
-	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+	return atomic_sub_return(n, &device->unacked_cnt);
 }
 
 static inline bool is_sync_target_state(enum drbd_conns connection_state)
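The counter rework above drops the ERR_IF_CNT_IS_NEGATIVE() stringification in favor of helpers that return the post-decrement value, checked by drbd's expect() macro at each call site. A standalone userspace sketch of the same shape (expect() is mocked with fprintf here; drbd's real macro reports through drbd_err()):

#include <stdatomic.h>
#include <stdio.h>

#define expect(ctx, cond) ({						\
	int _ok = (cond);						\
	if (!_ok)							\
		fprintf(stderr, "%s: failed expectation: %s\n",		\
			(ctx), #cond);					\
	_ok;								\
})

static atomic_int ap_pending_cnt;

/* atomic_dec_return() analogue: value after the decrement */
static int __dec_ap_pending(void)
{
	return atomic_fetch_sub(&ap_pending_cnt, 1) - 1;
}

#define dec_ap_pending() \
	((void)expect("ap_pending", __dec_ap_pending() >= 0))

An underflow is still reported at the offending call site (via the macro expansion), without threading __func__/__LINE__ through every helper.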
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2c764f7ee4a7..83987e7a5ef2 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -231,9 +231,11 @@ void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
 	}
 	req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests);
 	list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+		struct drbd_peer_device *peer_device;
 		if (req->epoch != expect_epoch)
 			break;
-		_req_mod(req, BARRIER_ACKED);
+		peer_device = conn_peer_device(connection, req->device->vnr);
+		_req_mod(req, BARRIER_ACKED, peer_device);
 	}
 	spin_unlock_irq(&connection->resource->req_lock);
 
@@ -256,10 +258,13 @@ bail:
 /* must hold resource->req_lock */
 void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
 {
+	struct drbd_peer_device *peer_device;
 	struct drbd_request *req, *r;
 
-	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
-		_req_mod(req, what);
+	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
+		peer_device = conn_peer_device(connection, req->device->vnr);
+		_req_mod(req, what, peer_device);
+	}
 }
 
 void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
@@ -297,7 +302,7 @@ void tl_abort_disk_io(struct drbd_device *device)
 			continue;
 		if (req->device != device)
 			continue;
-		_req_mod(req, ABORT_DISK_IO);
+		_req_mod(req, ABORT_DISK_IO, NULL);
 	}
 	spin_unlock_irq(&connection->resource->req_lock);
 }
@@ -1198,10 +1203,11 @@ static int fill_bitmap_rle_bits(struct drbd_device *device,
 * code upon failure.
 */
 static int
-send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
+send_bitmap_rle_or_plain(struct drbd_peer_device *peer_device, struct bm_xfer_ctx *c)
 {
-	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
-	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock = &peer_device->connection->data;
+	unsigned int header_size = drbd_header_size(peer_device->connection);
 	struct p_compressed_bm *p = sock->sbuf + header_size;
 	int len, err;
 
@@ -1212,7 +1218,7 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
 
 	if (len) {
 		dcbp_set_code(p, RLE_VLI_Bits);
-		err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+		err = __send_command(peer_device->connection, device->vnr, sock,
 				     P_COMPRESSED_BITMAP, sizeof(*p) + len,
 				     NULL, 0);
 		c->packets[0]++;
@@ -1233,7 +1239,8 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
 		len = num_words * sizeof(*p);
 		if (len)
 			drbd_bm_get_lel(device, c->word_offset, num_words, p);
-		err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
+		err = __send_command(peer_device->connection, device->vnr, sock, P_BITMAP,
+				     len, NULL, 0);
 		c->word_offset += num_words;
 		c->bit_offset = c->word_offset * BITS_PER_LONG;
@@ -1245,7 +1252,7 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
 	}
 	if (!err) {
 		if (len == 0) {
-			INFO_bm_xfer_stats(device, "send", c);
+			INFO_bm_xfer_stats(peer_device, "send", c);
 			return 0;
 		} else
 			return 1;
@@ -1254,7 +1261,8 @@ send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
 }
 
 /* See the comment at receive_bitmap() */
-static int _drbd_send_bitmap(struct drbd_device *device)
+static int _drbd_send_bitmap(struct drbd_device *device,
+			     struct drbd_peer_device *peer_device)
 {
 	struct bm_xfer_ctx c;
 	int err;
@@ -1266,7 +1274,7 @@ static int _drbd_send_bitmap(struct drbd_device *device)
 		if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
 			drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
 			drbd_bm_set_all(device);
-			if (drbd_bm_write(device)) {
+			if (drbd_bm_write(device, peer_device)) {
 				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
 				 * but otherwise process as per normal - need to tell other
 				 * side that a full resync is required! */
@@ -1285,20 +1293,20 @@ static int _drbd_send_bitmap(struct drbd_device *device)
 	};
 
 	do {
-		err = send_bitmap_rle_or_plain(device, &c);
+		err = send_bitmap_rle_or_plain(peer_device, &c);
 	} while (err > 0);
 
 	return err == 0;
}
 
-int drbd_send_bitmap(struct drbd_device *device)
+int drbd_send_bitmap(struct drbd_device *device, struct drbd_peer_device *peer_device)
 {
-	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+	struct drbd_socket *sock = &peer_device->connection->data;
 	int err = -1;
 
 	mutex_lock(&sock->mutex);
 	if (sock->socket)
-		err = !_drbd_send_bitmap(device);
+		err = !_drbd_send_bitmap(device, peer_device);
 	mutex_unlock(&sock->mutex);
 	return err;
 }
@@ -3406,7 +3414,9 @@ void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
 *
 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
 */
-int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
+int drbd_bmio_set_n_write(struct drbd_device *device,
+			  struct drbd_peer_device *peer_device) __must_hold(local)
+
 {
 	int rv = -EIO;
 
@@ -3414,7 +3424,7 @@ int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 		drbd_md_sync(device);
 		drbd_bm_set_all(device);
 
-		rv = drbd_bm_write(device);
+		rv = drbd_bm_write(device, peer_device);
 
 		if (!rv) {
 			drbd_md_clear_flag(device, MDF_FULL_SYNC);
@@ -3430,11 +3440,13 @@ int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
 *
 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
 */
-int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
+int drbd_bmio_clear_n_write(struct drbd_device *device,
+			    struct drbd_peer_device *peer_device) __must_hold(local)
+
 {
 	drbd_resume_al(device);
 	drbd_bm_clear_all(device);
-	return drbd_bm_write(device);
+	return drbd_bm_write(device, peer_device);
 }
 
 static int w_bitmap_io(struct drbd_work *w, int unused)
@@ -3453,7 +3465,7 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
 
 	if (get_ldev(device)) {
 		drbd_bm_lock(device, work->why, work->flags);
-		rv = work->io_fn(device);
+		rv = work->io_fn(device, work->peer_device);
 		drbd_bm_unlock(device);
 		put_ldev(device);
 	}
@@ -3488,11 +3500,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused)
 * put_ldev().
 */
 void drbd_queue_bitmap_io(struct drbd_device *device,
-			  int (*io_fn)(struct drbd_device *),
+			  int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
 			  void (*done)(struct drbd_device *, int),
-			  char *why, enum bm_flag flags)
+			  char *why, enum bm_flag flags,
+			  struct drbd_peer_device *peer_device)
 {
-	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+	D_ASSERT(device, current == peer_device->connection->worker.task);
 
 	D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
 	D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
@@ -3501,6 +3514,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 		drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
 			why, device->bm_io_work.why);
 
+	device->bm_io_work.peer_device = peer_device;
 	device->bm_io_work.io_fn = io_fn;
 	device->bm_io_work.done = done;
 	device->bm_io_work.why = why;
@@ -3512,7 +3526,7 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 	 * application IO does not conflict anyways. */
 	if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
 		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
-			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+			drbd_queue_work(&peer_device->connection->sender_work,
 					&device->bm_io_work.w);
 	}
 	spin_unlock_irq(&device->resource->req_lock);
@@ -3528,8 +3542,10 @@ void drbd_queue_bitmap_io(struct drbd_device *device,
 * freezes application IO while the actual IO operation runs. This
 * function MAY NOT be called from worker context.
 */
-int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
-		char *why, enum bm_flag flags)
+int drbd_bitmap_io(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+		char *why, enum bm_flag flags,
+		struct drbd_peer_device *peer_device)
 {
 	/* Only suspend io, if some operation is supposed to be locked out */
 	const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
@@ -3541,7 +3557,7 @@ int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *
 		drbd_suspend_io(device);
 	drbd_bm_lock(device, why, flags);
-	rv = io_fn(device);
+	rv = io_fn(device, peer_device);
 	drbd_bm_unlock(device);
 
 	if (do_suspend_io)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 60757ac31701..1a5d3d72d91d 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1053,7 +1053,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct
 			   la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
 		drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
-			       "size changed", BM_LOCKED_MASK);
+			       "size changed", BM_LOCKED_MASK, NULL);
 
 		/* on-disk bitmap and activity log is authoritative again
 		 * (unless there was an IO error meanwhile...) */
@@ -1615,7 +1615,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 		drbd_send_sync_param(peer_device);
 	}
 
-	kvfree_rcu(old_disk_conf);
+	kvfree_rcu_mightsleep(old_disk_conf);
 	kfree(old_plan);
 	mod_timer(&device->request_timer, jiffies + HZ);
 	goto success;
@@ -2027,13 +2027,15 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		drbd_info(device, "Assuming that all blocks are out of sync "
 		     "(aka FullSync)\n");
 		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
-			"set_n_write from attaching", BM_LOCKED_MASK)) {
+			"set_n_write from attaching", BM_LOCKED_MASK,
+			NULL)) {
 			retcode = ERR_IO_MD_DISK;
 			goto force_diskless_dec;
 		}
 	} else {
 		if (drbd_bitmap_io(device, &drbd_bm_read,
-			"read from attaching", BM_LOCKED_MASK)) {
+			"read from attaching", BM_LOCKED_MASK,
+			NULL)) {
 			retcode = ERR_IO_MD_DISK;
 			goto force_diskless_dec;
 		}
@@ -2446,7 +2448,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
 	mutex_unlock(&connection->resource->conf_update);
 	mutex_unlock(&connection->data.mutex);
-	kvfree_rcu(old_net_conf);
+	kvfree_rcu_mightsleep(old_net_conf);
 
 	if (connection->cstate >= C_WF_REPORT_PARAMS) {
 		struct drbd_peer_device *peer_device;
@@ -2860,7 +2862,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 		new_disk_conf->disk_size = (sector_t)rs.resize_size;
 		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
 		mutex_unlock(&device->resource->conf_update);
-		kvfree_rcu(old_disk_conf);
+		kvfree_rcu_mightsleep(old_disk_conf);
 		new_disk_conf = NULL;
 	}
 
@@ -2972,7 +2974,7 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
 		retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
 		if (retcode >= SS_SUCCESS) {
 			if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
-				"set_n_write from invalidate", BM_LOCKED_MASK))
+				"set_n_write from invalidate", BM_LOCKED_MASK, NULL))
 				retcode = ERR_IO_MD_DISK;
 		}
 	} else
@@ -3005,11 +3007,12 @@ out:
 	return 0;
 }
 
-static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
+static int drbd_bmio_set_susp_al(struct drbd_device *device,
+				 struct drbd_peer_device *peer_device) __must_hold(local)
 {
 	int rv;
 
-	rv = drbd_bmio_set_n_write(device);
+	rv = drbd_bmio_set_n_write(device, peer_device);
 	drbd_suspend_al(device);
 	return rv;
 }
@@ -3052,7 +3055,7 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
 		if (retcode >= SS_SUCCESS) {
 			if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
 				"set_n_write from invalidate_peer",
-				BM_LOCKED_SET_ALLOWED))
+				BM_LOCKED_SET_ALLOWED, NULL))
 				retcode = ERR_IO_MD_DISK;
 		}
 	} else
@@ -4148,7 +4151,7 @@ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
 
 	if (args.clear_bm) {
 		err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
-			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+			"clear_n_write from new_c_uuid", BM_LOCKED_MASK, NULL);
 		if (err) {
 			drbd_err(device, "Writing bitmap failed with %d\n", err);
 			retcode = ERR_IO_MD_DISK;
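Several hunks in drbd_nl.c (and, below, in drbd_receiver.c and drbd_state.c) switch single-argument kvfree_rcu() calls to kvfree_rcu_mightsleep(). The two-argument form queues the object through an embedded rcu_head and never blocks; the headless form has no rcu_head to queue and may block (it can fall back to synchronize_rcu()), which the new name spells out. A sketch of both forms (struct name is illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct conf_example {
	struct rcu_head rcu;
	/* ... payload ... */
};

static void drop_with_head(struct conf_example *old)
{
	kvfree_rcu(old, rcu);		/* non-blocking, atomic-safe */
}

static void drop_headless(struct conf_example *old)
{
	kvfree_rcu_mightsleep(old);	/* may sleep: process context only */
}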
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 757f4692b5bd..8c2bc47de473 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1283,7 +1283,7 @@ static void one_flush_endio(struct bio *bio)
 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
 {
 	struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
-				    REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
+				    REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);
 	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
 
 	if (!octx) {
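The submit_one_flush() fix restores the rule that REQ_OP_FLUSH is a request-level operation seen by drivers; a bio that carries no data and only wants the volatile cache flushed is submitted as a zero-sector write with REQ_PREFLUSH. A minimal sketch of issuing such an empty flush (helper name and callback wiring are illustrative):

#include <linux/bio.h>
#include <linux/blkdev.h>

static void issue_empty_flush(struct block_device *bdev,
			      bio_end_io_t *done, void *private)
{
	/* zero bvecs: no payload, only the preflush semantics */
	struct bio *bio = bio_alloc(bdev, 0,
				    REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);

	bio->bi_private = private;
	bio->bi_end_io = done;
	submit_bio(bio);
}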
@@ -2044,11 +2044,11 @@ static int e_end_resync_block(struct drbd_work *w, int unused)
 	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 
 	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
-		drbd_set_in_sync(device, sector, peer_req->i.size);
+		drbd_set_in_sync(peer_device, sector, peer_req->i.size);
 		err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
 	} else {
 		/* Record failure to sync */
-		drbd_rs_failed_io(device, sector, peer_req->i.size);
+		drbd_rs_failed_io(peer_device, sector, peer_req->i.size);
 
 		err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
 	}
@@ -2067,7 +2067,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
 	if (!peer_req)
 		goto fail;
 
-	dec_rs_pending(device);
+	dec_rs_pending(peer_device);
 
 	inc_unacked(device);
 	/* corresponding dec_unacked() in e_end_resync_block()
@@ -2138,7 +2138,7 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i
 
 	err = recv_dless_read(peer_device, req, sector, pi->size);
 	if (!err)
-		req_mod(req, DATA_RECEIVED);
+		req_mod(req, DATA_RECEIVED, peer_device);
 	/* else: nothing. handled from drbd_disconnect...
 	 * I don't think we may complete this just yet
 	 * in case we are "on-disconnect: freeze" */
@@ -2196,7 +2196,7 @@ static void restart_conflicting_writes(struct drbd_device *device,
 			continue;
 		/* as it is RQ_POSTPONED, this will cause it to
 		 * be queued on the retry workqueue. */
-		__req_mod(req, CONFLICT_RESOLVED, NULL);
+		__req_mod(req, CONFLICT_RESOLVED, NULL, NULL);
 	}
 }
 
@@ -2220,7 +2220,7 @@ static int e_end_block(struct drbd_work *w, int cancel)
 			P_RS_WRITE_ACK : P_WRITE_ACK;
 		err = drbd_send_ack(peer_device, pcmd, peer_req);
 		if (pcmd == P_RS_WRITE_ACK)
-			drbd_set_in_sync(device, sector, peer_req->i.size);
+			drbd_set_in_sync(peer_device, sector, peer_req->i.size);
 	} else {
 		err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
 		/* we expect it to be marked out of sync anyways...
@@ -2420,6 +2420,7 @@ static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
 static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
 				    unsigned int size)
 {
+	struct drbd_peer_device *peer_device = first_peer_device(device);
 	struct drbd_interval *i;
 
     repeat:
@@ -2433,7 +2434,7 @@ static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
 		if (!(req->rq_state & RQ_POSTPONED))
 			continue;
 		req->rq_state &= ~RQ_POSTPONED;
-		__req_mod(req, NEG_ACKED, &m);
+		__req_mod(req, NEG_ACKED, peer_device, &m);
 		spin_unlock_irq(&device->resource->req_lock);
 		if (m.bio)
 			complete_master_bio(device, &m);
@@ -2690,7 +2691,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	if (device->state.pdsk < D_INCONSISTENT) {
 		/* In case we have the only disk of the cluster, */
-		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+		drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size);
 		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
 		drbd_al_begin_io(device, &peer_req->i);
 		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
@@ -2729,9 +2730,10 @@ out_interrupted:
 * The current sync rate used here uses only the most recent two step marks,
 * to have a short time average so we can react faster.
 */
-bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
 			      bool throttle_if_app_is_waiting)
 {
+	struct drbd_device *device = peer_device->device;
 	struct lc_element *tmp;
 	bool throttle = drbd_rs_c_min_rate_throttle(device);
 
@@ -2843,7 +2845,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 			break;
 		case P_OV_REPLY:
 			verb = 0;
-			dec_rs_pending(device);
+			dec_rs_pending(peer_device);
 			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
 			break;
 		default:
@@ -2914,7 +2916,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		/* track progress, we may need to throttle */
 		atomic_add(size >> 9, &device->rs_sect_in);
 		peer_req->w.cb = w_e_end_ov_reply;
-		dec_rs_pending(device);
+		dec_rs_pending(peer_device);
 		/* drbd_rs_begin_io done when we sent this request,
 		 * but accounting still needs to be done. */
 		goto submit_for_resync;
@@ -2977,7 +2979,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
 	if (device->state.peer != R_PRIMARY
-	    && drbd_rs_should_slow_down(device, sector, false))
+	    && drbd_rs_should_slow_down(peer_device, sector, false))
 		schedule_timeout_uninterruptible(HZ/10);
 	update_receiver_timing_details(connection, drbd_rs_begin_io);
 	if (drbd_rs_begin_io(device, sector))
@@ -3226,10 +3228,11 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
 -1096   requires proto 96
 */
-static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
+static int drbd_uuid_compare(struct drbd_peer_device *const peer_device,
+		enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
 {
-	struct drbd_peer_device *const peer_device = first_peer_device(device);
-	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+	struct drbd_connection *const connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
 	u64 self, peer;
 	int i, j;
 
@@ -3465,7 +3468,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
 	drbd_uuid_dump(device, "peer", device->p_uuid,
 		       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
 
-	hg = drbd_uuid_compare(device, peer_role, &rule_nr);
+	hg = drbd_uuid_compare(peer_device, peer_role, &rule_nr);
 	spin_unlock_irq(&device->ldev->md.uuid_lock);
 
 	drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
@@ -3591,7 +3594,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
 	if (abs(hg) >= 2) {
 		drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
 		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
-					BM_LOCKED_SET_ALLOWED))
+					BM_LOCKED_SET_ALLOWED, NULL))
 			return C_MASK;
 	}
 
@@ -3759,7 +3762,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 	drbd_info(connection, "peer data-integrity-alg: %s\n",
 		  integrity_alg[0] ? integrity_alg : "(none)");
 
-	kvfree_rcu(old_net_conf);
+	kvfree_rcu_mightsleep(old_net_conf);
 	return 0;
 
 disconnect_rcu_unlock:
@@ -4127,7 +4130,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 
 		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
 		mutex_unlock(&connection->resource->conf_update);
-		kvfree_rcu(old_disk_conf);
+		kvfree_rcu_mightsleep(old_disk_conf);
 
 		drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
 			 (unsigned long)p_usize, (unsigned long)my_usize);
@@ -4270,7 +4273,7 @@ static int receive_uuids(struct drbd_connection *connection, struct packet_info
 			drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
 			drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
 					"clear_n_write from receive_uuids",
-					BM_LOCKED_TEST_ALLOWED);
+					BM_LOCKED_TEST_ALLOWED, NULL);
 			_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
 			_drbd_uuid_set(device, UI_BITMAP, 0);
 			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
@@ -4448,7 +4451,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 			else if (os.conn >= C_SYNC_SOURCE &&
 				 peer_state.conn == C_CONNECTED) {
 				if (drbd_bm_total_weight(device) <= device->rs_failed)
-					drbd_resync_finished(device);
+					drbd_resync_finished(peer_device);
 				return 0;
 			}
 		}
@@ -4456,8 +4459,8 @@ static int receive_state(struct drbd_connection *connection, struct packet_info
 	/* explicit verify finished notification, stop sector reached. */
 	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
 	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
-		ov_out_of_sync_print(device);
-		drbd_resync_finished(device);
+		ov_out_of_sync_print(peer_device);
+		drbd_resync_finished(peer_device);
 		return 0;
 	}
 
@@ -4766,11 +4769,11 @@ decode_bitmap_c(struct drbd_peer_device *peer_device,
 	return -EIO;
 }
 
-void INFO_bm_xfer_stats(struct drbd_device *device,
+void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
 		const char *direction, struct bm_xfer_ctx *c)
 {
 	/* what would it take to transfer it "plaintext" */
-	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	unsigned int header_size = drbd_header_size(peer_device->connection);
 	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
 	unsigned int plain =
 		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
@@ -4794,7 +4797,7 @@ void INFO_bm_xfer_stats(struct drbd_device *device,
 		r = 1000;
 
 	r = 1000 - r;
-	drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	drbd_info(peer_device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
 	     "total %u; compression: %u.%u%%\n",
 			direction,
 			c->bytes[1], c->packets[1],
@@ -4872,12 +4875,12 @@ static int receive_bitmap(struct drbd_connection *connection, struct packet_info
 		goto out;
 	}
 
-	INFO_bm_xfer_stats(device, "receive", &c);
+	INFO_bm_xfer_stats(peer_device, "receive", &c);
 
 	if (device->state.conn == C_WF_BITMAP_T) {
 		enum drbd_state_rv rv;
 
-		err = drbd_send_bitmap(device);
+		err = drbd_send_bitmap(device, peer_device);
 		if (err)
 			goto out;
 		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
@@ -4935,7 +4938,7 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet
 			  drbd_conn_str(device->state.conn));
 	}
 
-	drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+	drbd_set_out_of_sync(peer_device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
 
 	return 0;
 }
@@ -4956,7 +4959,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
 	sector = be64_to_cpu(p->sector);
 	size = be32_to_cpu(p->blksize);
 
-	dec_rs_pending(device);
+	dec_rs_pending(peer_device);
 
 	if (get_ldev(device)) {
 		struct drbd_peer_request *peer_req;
@@ -5214,7 +5217,7 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
 
 	if (get_ldev(device)) {
 		drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
-				"write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
+				"write from disconnected", BM_LOCKED_CHANGE_ALLOWED, NULL);
 		put_ldev(device);
 	}
 
@@ -5648,22 +5651,23 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info *
 
 	if (get_ldev(device)) {
 		drbd_rs_complete_io(device, sector);
-		drbd_set_in_sync(device, sector, blksize);
+		drbd_set_in_sync(peer_device, sector, blksize);
 		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
 		device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
 		put_ldev(device);
 	}
-	dec_rs_pending(device);
+	dec_rs_pending(peer_device);
 	atomic_add(blksize >> 9, &device->rs_sect_in);
 
 	return 0;
 }
 
 static int
-validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+validate_req_change_req_state(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 			      struct rb_root *root, const char *func,
 			      enum drbd_req_event what, bool missing_ok)
 {
+	struct drbd_device *device = peer_device->device;
 	struct drbd_request *req;
 	struct bio_and_error m;
 
@@ -5673,7 +5677,7 @@ validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t secto
 		spin_unlock_irq(&device->resource->req_lock);
 		return -EIO;
 	}
-	__req_mod(req, what, &m);
+	__req_mod(req, what, peer_device, &m);
 	spin_unlock_irq(&device->resource->req_lock);
 
 	if (m.bio)
@@ -5698,8 +5702,8 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info *
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
 	if (p->block_id == ID_SYNCER) {
-		drbd_set_in_sync(device, sector, blksize);
-		dec_rs_pending(device);
+		drbd_set_in_sync(peer_device, sector, blksize);
+		dec_rs_pending(peer_device);
 		return 0;
 	}
 	switch (pi->cmd) {
@@ -5722,7 +5726,7 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info *
 		BUG();
 	}
 
-	return validate_req_change_req_state(device, p->block_id, sector,
+	return validate_req_change_req_state(peer_device, p->block_id, sector,
 					     &device->write_requests, __func__,
 					     what, false);
 }
@@ -5744,12 +5748,12 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
 	if (p->block_id == ID_SYNCER) {
-		dec_rs_pending(device);
-		drbd_rs_failed_io(device, sector, size);
+		dec_rs_pending(peer_device);
+		drbd_rs_failed_io(peer_device, sector, size);
 		return 0;
 	}
 
-	err = validate_req_change_req_state(device, p->block_id, sector,
+	err = validate_req_change_req_state(peer_device, p->block_id, sector,
 					    &device->write_requests, __func__,
 					    NEG_ACKED, true);
 	if (err) {
@@ -5758,7 +5762,7 @@ static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi
 		   request is no longer in the collision hash. */
 		/* In Protocol B we might already have got a P_RECV_ACK
 		   but then get a P_NEG_ACK afterwards. */
-		drbd_set_out_of_sync(device, sector, size);
+		drbd_set_out_of_sync(peer_device, sector, size);
 	}
 	return 0;
 }
@@ -5780,7 +5784,7 @@ static int got_NegDReply(struct drbd_connection *connection, struct packet_info
 	drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
 			(unsigned long long)sector, be32_to_cpu(p->blksize));
 
-	return validate_req_change_req_state(device, p->block_id, sector,
+	return validate_req_change_req_state(peer_device, p->block_id, sector,
 					     &device->read_requests, __func__,
 					     NEG_ACKED, false);
 }
@@ -5803,13 +5807,13 @@ static int got_NegRSDReply(struct drbd_connection *connection, struct packet_inf
 
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
-	dec_rs_pending(device);
+	dec_rs_pending(peer_device);
 
 	if (get_ldev_if_state(device, D_FAILED)) {
 		drbd_rs_complete_io(device, sector);
 		switch (pi->cmd) {
 		case P_NEG_RS_DREPLY:
-			drbd_rs_failed_io(device, sector, size);
+			drbd_rs_failed_io(peer_device, sector, size);
 			break;
 		case P_RS_CANCEL:
 			break;
@@ -5866,21 +5870,21 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info *
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
 	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
-		drbd_ov_out_of_sync_found(device, sector, size);
+		drbd_ov_out_of_sync_found(peer_device, sector, size);
 	else
-		ov_out_of_sync_print(device);
+		ov_out_of_sync_print(peer_device);
 
 	if (!get_ldev(device))
 		return 0;
 
 	drbd_rs_complete_io(device, sector);
-	dec_rs_pending(device);
+	dec_rs_pending(peer_device);
 
 	--device->ov_left;
 
 	/* let's advance progress step marks only for every other megabyte */
 	if ((device->ov_left & 0x200) == 0x200)
-		drbd_advance_rs_marks(device, device->ov_left);
+		drbd_advance_rs_marks(peer_device, device->ov_left);
 
 	if (device->ov_left == 0) {
 		dw = kmalloc(sizeof(*dw), GFP_NOIO);
@@ -5890,8 +5894,8 @@ static int got_OVResult(struct drbd_connection *connection, struct packet_info *
 			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
 		} else {
 			drbd_err(device, "kmalloc(dw) failed.");
-			ov_out_of_sync_print(device);
-			drbd_resync_finished(device);
+			ov_out_of_sync_print(peer_device);
+			drbd_resync_finished(peer_device);
 		}
 	}
 	put_ldev(device);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index e36216d50753..380e6584a4ee 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -122,12 +122,13 @@ void drbd_req_destroy(struct kref *kref)
 		 * before it even was submitted or sent.
 		 * In that case we do not want to touch the bitmap at all.
 		 */
+		struct drbd_peer_device *peer_device = first_peer_device(device);
 		if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
 			if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
-				drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+				drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size);
 
 			if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
-				drbd_set_in_sync(device, req->i.sector, req->i.size);
+				drbd_set_in_sync(peer_device, req->i.sector, req->i.size);
 		}
 
 		/* one might be tempted to move the drbd_al_complete_io
@@ -552,12 +553,15 @@ static inline bool is_pending_write_protocol_A(struct drbd_request *req)
 * happen "atomically" within the req_lock,
 * and it enforces that we have to think in a very structured manner
 * about the "events" that may happen to a request during its life time ...
+ *
+ *
+ * peer_device == NULL means local disk
 */
 int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct drbd_peer_device *peer_device,
 		struct bio_and_error *m)
 {
 	struct drbd_device *const device = req->device;
-	struct drbd_peer_device *const peer_device = first_peer_device(device);
 	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	struct net_conf *nc;
 	int p, rv = 0;
@@ -617,7 +621,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case READ_COMPLETED_WITH_ERROR:
-		drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+		drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size);
 		drbd_report_io_error(device, req);
 		__drbd_chk_io_error(device, DRBD_READ_ERROR);
 		fallthrough;
@@ -1100,6 +1104,7 @@ static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
 static int drbd_process_write_request(struct drbd_request *req)
 {
 	struct drbd_device *device = req->device;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
 	int remote, send_oos;
 
 	remote = drbd_should_do_remote(device->state);
@@ -1115,7 +1120,7 @@ static int drbd_process_write_request(struct drbd_request *req)
 		/* The only size==0 bios we expect are empty flushes. */
 		D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH);
 		if (remote)
-			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
+			_req_mod(req, QUEUE_AS_DRBD_BARRIER, peer_device);
 		return remote;
 	}
 
@@ -1125,10 +1130,10 @@ static int drbd_process_write_request(struct drbd_request *req)
 	D_ASSERT(device, !(remote && send_oos));
 
 	if (remote) {
-		_req_mod(req, TO_BE_SENT);
-		_req_mod(req, QUEUE_FOR_NET_WRITE);
-	} else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
-		_req_mod(req, QUEUE_FOR_SEND_OOS);
+		_req_mod(req, TO_BE_SENT, peer_device);
+		_req_mod(req, QUEUE_FOR_NET_WRITE, peer_device);
+	} else if (drbd_set_out_of_sync(peer_device, req->i.sector, req->i.size))
+		_req_mod(req, QUEUE_FOR_SEND_OOS, peer_device);
 
 	return remote;
 }
@@ -1312,6 +1317,7 @@ static void drbd_update_plug(struct drbd_plug_cb *plug, struct drbd_request *req
 static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
 {
 	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
 	const int rw = bio_data_dir(req->master_bio);
 	struct bio_and_error m = { NULL, };
 	bool no_remote = false;
@@ -1375,8 +1381,8 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 		/* We either have a private_bio, or we can read from remote.
 		 * Otherwise we had done the goto nodata above. */
 		if (req->private_bio == NULL) {
-			_req_mod(req, TO_BE_SENT);
-			_req_mod(req, QUEUE_FOR_NET_READ);
+			_req_mod(req, TO_BE_SENT, peer_device);
+			_req_mod(req, QUEUE_FOR_NET_READ, peer_device);
 		} else
 			no_remote = true;
 	}
@@ -1397,7 +1403,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
 		req->pre_submit_jif = jiffies;
 		list_add_tail(&req->req_pending_local,
 			&device->pending_completion[rw == WRITE]);
-		_req_mod(req, TO_BE_SUBMITTED);
+		_req_mod(req, TO_BE_SUBMITTED, NULL);
 		/* but we need to give up the spinlock to submit */
 		submit_private_bio = true;
 	} else if (no_remote) {
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index b4017b5c3fbc..9ae860e7591b 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -267,6 +267,7 @@ struct bio_and_error {
 extern void start_new_tl_epoch(struct drbd_connection *connection);
 extern void drbd_req_destroy(struct kref *kref);
 extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct drbd_peer_device *peer_device,
 		struct bio_and_error *m);
 extern void complete_master_bio(struct drbd_device *device,
 		struct bio_and_error *m);
@@ -280,14 +281,15 @@ extern void drbd_restart_request(struct drbd_request *req);
 
 /* use this if you don't want to deal with calling complete_master_bio()
 * outside the spinlock, e.g. when walking some list on cleanup. */
-static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct drbd_peer_device *peer_device)
 {
 	struct drbd_device *device = req->device;
 	struct bio_and_error m;
 	int rv;
 
 	/* __req_mod possibly frees req, do not touch req after that! */
-	rv = __req_mod(req, what, &m);
+	rv = __req_mod(req, what, peer_device, &m);
 	if (m.bio)
 		complete_master_bio(device, &m);
 
@@ -299,7 +301,8 @@ static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
 * of the lower level driver completion callback, so we need to
 * spin_lock_irqsave here. */
 static inline int req_mod(struct drbd_request *req,
-		enum drbd_req_event what)
+		enum drbd_req_event what,
+		struct drbd_peer_device *peer_device)
 {
 	unsigned long flags;
 	struct drbd_device *device = req->device;
@@ -307,7 +310,7 @@ static inline int req_mod(struct drbd_request *req,
 	int rv;
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
-	rv = __req_mod(req, what, &m);
+	rv = __req_mod(req, what, peer_device, &m);
 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
 
 	if (m.bio)
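With the peer_device argument threaded through, the three request-event entry points in drbd_req.h keep their existing locking contract: __req_mod() runs under req_lock and hands back a bio_and_error, _req_mod() is for callers that already hold the lock but still want complete_master_bio() handled, and req_mod() takes the lock itself (irqsave, so it is usable from lower-level completion callbacks). Illustrative call sites (events and contexts chosen for the example, not quoted from the patch):

/* walking the transfer log while holding resource->req_lock */
_req_mod(req, BARRIER_ACKED, peer_device);

/* from a path that does not hold the lock: req_mod() does the
 * spin_lock_irqsave()/unlock pair itself */
req_mod(req, DATA_RECEIVED, peer_device);

/* a purely local event involves no peer: pass NULL */
_req_mod(req, TO_BE_SUBMITTED, NULL);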
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 75d13ea0024f..287a8d1d3f70 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1222,9 +1222,11 @@ void drbd_resume_al(struct drbd_device *device)
 }
 
 /* helper for _drbd_set_state */
-static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+static void set_ov_position(struct drbd_peer_device *peer_device, enum drbd_conns cs)
 {
-	if (first_peer_device(device)->connection->agreed_pro_version < 90)
+	struct drbd_device *device = peer_device->device;
+
+	if (peer_device->connection->agreed_pro_version < 90)
 		device->ov_start_sector = 0;
 	device->rs_total = drbd_bm_bits(device);
 	device->ov_position = 0;
@@ -1387,7 +1389,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 		unsigned long now = jiffies;
 		int i;
 
-		set_ov_position(device, ns.conn);
+		set_ov_position(peer_device, ns.conn);
 		device->rs_start = now;
 		device->rs_last_sect_ev = 0;
 		device->ov_last_oos_size = 0;
@@ -1398,7 +1400,7 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 			device->rs_mark_time[i] = now;
 		}
 
-		drbd_rs_controller_reset(device);
+		drbd_rs_controller_reset(peer_device);
 
 		if (ns.conn == C_VERIFY_S) {
 			drbd_info(device, "Starting Online Verify from sector %llu\n",
@@ -1518,8 +1520,9 @@ static void abw_start_sync(struct drbd_device *device, int rv)
 }
 
 int drbd_bitmap_io_from_worker(struct drbd_device *device,
-		int (*io_fn)(struct drbd_device *),
-		char *why, enum bm_flag flags)
+		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+		char *why, enum bm_flag flags,
+		struct drbd_peer_device *peer_device)
 {
 	int rv;
 
@@ -1529,7 +1532,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
 	atomic_inc(&device->suspend_cnt);
 	drbd_bm_lock(device, why, flags);
-	rv = io_fn(device);
+	rv = io_fn(device, peer_device);
 	drbd_bm_unlock(device);
 	drbd_resume_io(device);
 
@@ -1809,7 +1812,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	    device->state.conn == C_WF_BITMAP_S)
 		drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
 				"send_bitmap (WFBitMapS)",
-				BM_LOCKED_TEST_ALLOWED);
+				BM_LOCKED_TEST_ALLOWED, peer_device);
 
 	/* Lost contact to peer's copy of the data */
 	if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
@@ -1839,7 +1842,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 			 * No harm done if the bitmap still changes,
 			 * redirtied pages will follow later. */
 			drbd_bitmap_io_from_worker(device, &drbd_bm_write,
-				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
+				"demote diskless peer", BM_LOCKED_SET_ALLOWED, peer_device);
 			put_ldev(device);
 		}
 
@@ -1851,7 +1854,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		/* No changes to the bitmap expected this time, so assert that,
 		 * even though no harm was done if it did change. */
 		drbd_bitmap_io_from_worker(device, &drbd_bm_write,
-				"demote", BM_LOCKED_TEST_ALLOWED);
+				"demote", BM_LOCKED_TEST_ALLOWED, peer_device);
 		put_ldev(device);
 	}
 
@@ -1888,7 +1891,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 		/* no other bitmap changes expected during this phase */
 		drbd_queue_bitmap_io(device,
 			&drbd_bmio_set_n_write, &abw_start_sync,
-			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED,
+			peer_device);
 
 	/* first half of local IO error, failure to attach,
 	 * or administrative detach */
@@ -2011,7 +2015,8 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os,
 	if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
 	    (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
 		drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
-			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED,
+			peer_device);
 		put_ldev(device);
 	}
 
@@ -2071,7 +2076,7 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
 		conn_free_crypto(connection);
 		mutex_unlock(&connection->resource->conf_update);
 
-		kvfree_rcu(old_conf);
+		kvfree_rcu_mightsleep(old_conf);
 	}
 
 	if (ns_max.susp_fen) {
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f46738040d6b..4352a50fbb3f 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -28,8 +28,8 @@
 #include "drbd_protocol.h"
 #include "drbd_req.h"
 
-static int make_ov_request(struct drbd_device *, int);
-static int make_resync_request(struct drbd_device *, int);
+static int make_ov_request(struct drbd_peer_device *, int);
+static int make_resync_request(struct drbd_peer_device *, int);
 
 /* endio handlers:
 *   drbd_md_endio (defined here)
@@ -124,7 +124,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l
 		 * In case of a write error, send the neg ack anyways. */
 		if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags))
 			inc_unacked(device);
-		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+		drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size);
 	}
 
 	spin_lock_irqsave(&device->resource->req_lock, flags);
@@ -276,7 +276,7 @@ void drbd_request_endio(struct bio *bio)
 
 	/* not req_mod(), we need irqsave here! */
 	spin_lock_irqsave(&device->resource->req_lock, flags);
-	__req_mod(req, what, &m);
+	__req_mod(req, what, NULL, &m);
 	spin_unlock_irqrestore(&device->resource->req_lock, flags);
 	put_ldev(device);
 
@@ -363,7 +363,7 @@ static int w_e_send_csum(struct drbd_work *w, int cancel)
 		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
 		drbd_free_peer_req(device, peer_req);
 		peer_req = NULL;
-		inc_rs_pending(device);
+		inc_rs_pending(peer_device);
 		err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_CSUM_RS_REQUEST);
@@ -430,10 +430,10 @@ int w_resync_timer(struct drbd_work *w, int cancel)
 
 	switch (device->state.conn) {
 	case C_VERIFY_S:
-		make_ov_request(device, cancel);
+		make_ov_request(first_peer_device(device), cancel);
 		break;
 	case C_SYNC_TARGET:
-		make_resync_request(device, cancel);
+		make_resync_request(first_peer_device(device), cancel);
 		break;
 	}
 
@@ -493,8 +493,9 @@ struct fifo_buffer *fifo_alloc(unsigned int fifo_size)
 	return fb;
 }
 
-static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+static int drbd_rs_controller(struct drbd_peer_device *peer_device, unsigned int sect_in)
 {
+	struct drbd_device *device = peer_device->device;
 	struct disk_conf *dc;
 	unsigned int want;	/* The number of sectors we want in-flight */
 	int req_sect;	/* Number of sectors to request in this turn */
@@ -545,8 +546,9 @@ static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 	return req_sect;
 }
 
-static int drbd_rs_number_requests(struct drbd_device *device)
+static int drbd_rs_number_requests(struct drbd_peer_device *peer_device)
 {
+	struct drbd_device *device = peer_device->device;
 	unsigned int sect_in;	/* Number of sectors that came in since the last turn */
 	int number, mxb;
 
@@ -556,7 +558,7 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	rcu_read_lock();
 	mxb = drbd_get_max_buffers(device) / 2;
 	if (rcu_dereference(device->rs_plan_s)->size) {
-		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+		number = drbd_rs_controller(peer_device, sect_in) >> (BM_BLOCK_SHIFT - 9);
 		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
 	} else {
 		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
@@ -580,9 +582,9 @@ static int drbd_rs_number_requests(struct drbd_device *device)
 	return number;
 }
 
-static int make_resync_request(struct drbd_device *const device, int cancel)
+static int make_resync_request(struct drbd_peer_device *const peer_device, int cancel)
 {
-	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_device *const device = peer_device->device;
 	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
 	unsigned long bit;
 	sector_t sector;
@@ -598,7 +600,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
 
 	if (device->rs_total == 0) {
 		/* empty resync? */
-		drbd_resync_finished(device);
+		drbd_resync_finished(peer_device);
 		return 0;
 	}
 
@@ -618,7 +620,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel)
 	}
 
 	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
-	number = drbd_rs_number_requests(device);
+	number = drbd_rs_number_requests(peer_device);
 	if (number <= 0)
 		goto requeue;
 
@@ -653,7 +655,7 @@ next_sector:
 
 		sector = BM_BIT_TO_SECT(bit);
 
-		if (drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(peer_device, sector)) {
 			device->bm_resync_fo = bit;
 			goto requeue;
 		}
@@ -729,13 +731,13 @@ next_sector:
 		} else {
 			int err;
 
-			inc_rs_pending(device);
+			inc_rs_pending(peer_device);
 			err = drbd_send_drequest(peer_device,
 						 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
 						 sector, size, ID_SYNCER);
 			if (err) {
 				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
-				dec_rs_pending(device);
+				dec_rs_pending(peer_device);
 				put_ldev(device);
 				return err;
 			}
@@ -760,8 +762,9 @@ next_sector:
 	return 0;
 }
 
-static int make_ov_request(struct drbd_device *device, int cancel)
+static int make_ov_request(struct drbd_peer_device *peer_device, int cancel)
 {
+	struct drbd_device *device = peer_device->device;
 	int number, i, size;
 	sector_t sector;
 	const sector_t capacity = get_capacity(device->vdisk);
@@ -770,7 +773,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 	if (unlikely(cancel))
 		return 1;
 
-	number = drbd_rs_number_requests(device);
+	number = drbd_rs_number_requests(peer_device);
 
 	sector = device->ov_position;
 	for (i = 0; i < number; i++) {
@@ -788,7 +791,7 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 
 		size = BM_BLOCK_SIZE;
 
-		if (drbd_try_rs_begin_io(device, sector)) {
+		if (drbd_try_rs_begin_io(peer_device, sector)) {
 			device->ov_position = sector;
 			goto requeue;
 		}
@@ -796,9 +799,9 @@ static int make_ov_request(struct drbd_device *device, int cancel)
 		if (sector + (size>>9) > capacity)
 			size = (capacity-sector)<<9;
 
-		inc_rs_pending(device);
+		inc_rs_pending(peer_device);
 		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
-			dec_rs_pending(device);
+			dec_rs_pending(peer_device);
 			return 0;
 		}
 		sector += BM_SECT_PER_BIT;
@@ -818,8 +821,8 @@ int w_ov_finished(struct drbd_work *w, int cancel)
 		container_of(w, struct drbd_device_work, w);
 	struct drbd_device *device = dw->device;
 	kfree(dw);
-	ov_out_of_sync_print(device);
-	drbd_resync_finished(device);
+	ov_out_of_sync_print(first_peer_device(device));
+	drbd_resync_finished(first_peer_device(device));
 
 	return 0;
 }
@@ -831,7 +834,7 @@ static int w_resync_finished(struct drbd_work *w, int cancel)
 	struct drbd_device *device = dw->device;
 	kfree(dw);
 
-	drbd_resync_finished(device);
+	drbd_resync_finished(first_peer_device(device));
 
 	return 0;
 }
@@ -846,9 +849,10 @@ static void ping_peer(struct drbd_device *device)
 		   test_bit(GOT_PING_ACK, &connection->flags) ||
 		   device->state.conn < C_CONNECTED);
 }
 
-int drbd_resync_finished(struct drbd_device *device)
+int drbd_resync_finished(struct drbd_peer_device *peer_device)
 {
-	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
 	unsigned long db, dt, dbdt;
 	unsigned long n_oos;
 	union drbd_state os, ns;
@@ -1129,7 +1133,7 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
 		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
 	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
 		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
-			inc_rs_pending(device);
+			inc_rs_pending(peer_device);
 			if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
 				err = drbd_send_rs_deallocated(peer_device, peer_req);
 			else
@@ -1148,7 +1152,7 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
 		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
 
 		/* update resync data with failure */
-		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+		drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
 	}
 
 	dec_unacked(device);
@@ -1199,12 +1203,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
 	}
 
 	if (eq) {
-		drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+		drbd_set_in_sync(peer_device, peer_req->i.sector,
peer_req->i.size); /* rs_same_csums unit is BM_BLOCK_SIZE */ device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req); } else { - inc_rs_pending(device); + inc_rs_pending(peer_device); peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ kfree(di); @@ -1257,10 +1261,10 @@ int w_e_end_ov_req(struct drbd_work *w, int cancel) * drbd_alloc_pages due to pp_in_use > max_buffers. */ drbd_free_peer_req(device, peer_req); peer_req = NULL; - inc_rs_pending(device); + inc_rs_pending(peer_device); err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY); if (err) - dec_rs_pending(device); + dec_rs_pending(peer_device); kfree(digest); out: @@ -1270,15 +1274,16 @@ out: return err; } -void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size) +void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device, sector_t sector, int size) { + struct drbd_device *device = peer_device->device; if (device->ov_last_oos_start + device->ov_last_oos_size == sector) { device->ov_last_oos_size += size>>9; } else { device->ov_last_oos_start = sector; device->ov_last_oos_size = size>>9; } - drbd_set_out_of_sync(device, sector, size); + drbd_set_out_of_sync(peer_device, sector, size); } int w_e_end_ov_reply(struct drbd_work *w, int cancel) @@ -1328,9 +1333,9 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) * drbd_alloc_pages due to pp_in_use > max_buffers. */ drbd_free_peer_req(device, peer_req); if (!eq) - drbd_ov_out_of_sync_found(device, sector, size); + drbd_ov_out_of_sync_found(peer_device, sector, size); else - ov_out_of_sync_print(device); + ov_out_of_sync_print(peer_device); err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); @@ -1341,14 +1346,14 @@ int w_e_end_ov_reply(struct drbd_work *w, int cancel) /* let's advance progress step marks only for every other megabyte */ if ((device->ov_left & 0x200) == 0x200) - drbd_advance_rs_marks(device, device->ov_left); + drbd_advance_rs_marks(peer_device, device->ov_left); stop_sector_reached = verify_can_do_stop_sector(device) && (sector + (size>>9)) >= device->ov_stop_sector; if (device->ov_left == 0 || stop_sector_reached) { - ov_out_of_sync_print(device); - drbd_resync_finished(device); + ov_out_of_sync_print(peer_device); + drbd_resync_finished(peer_device); } return err; @@ -1425,7 +1430,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1437,7 +1442,7 @@ int w_send_out_of_sync(struct drbd_work *w, int cancel) maybe_send_barrier(connection, req->epoch); err = drbd_send_out_of_sync(peer_device, req); - req_mod(req, OOS_HANDED_TO_NETWORK); + req_mod(req, OOS_HANDED_TO_NETWORK, peer_device); return err; } @@ -1457,7 +1462,7 @@ int w_send_dblock(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1467,7 +1472,7 @@ int w_send_dblock(struct drbd_work *w, int cancel) connection->send.current_epoch_writes++; err = drbd_send_dblock(peer_device, req); - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + req_mod(req, err ? 
SEND_FAILED : HANDED_OVER_TO_NETWORK, peer_device); if (do_send_unplug && !err) pd_send_unplug_remote(peer_device); @@ -1490,7 +1495,7 @@ int w_send_read_req(struct drbd_work *w, int cancel) int err; if (unlikely(cancel)) { - req_mod(req, SEND_CANCELED); + req_mod(req, SEND_CANCELED, peer_device); return 0; } req->pre_send_jif = jiffies; @@ -1502,7 +1507,7 @@ int w_send_read_req(struct drbd_work *w, int cancel) err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, (unsigned long)req); - req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK, peer_device); if (do_send_unplug && !err) pd_send_unplug_remote(peer_device); @@ -1668,8 +1673,9 @@ void drbd_resync_after_changed(struct drbd_device *device) } while (changed); } -void drbd_rs_controller_reset(struct drbd_device *device) +void drbd_rs_controller_reset(struct drbd_peer_device *peer_device) { + struct drbd_device *device = peer_device->device; struct gendisk *disk = device->ldev->backing_bdev->bd_disk; struct fifo_buffer *plan; @@ -1891,10 +1897,10 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) rcu_read_unlock(); schedule_timeout_interruptible(timeo); } - drbd_resync_finished(device); + drbd_resync_finished(peer_device); } - drbd_rs_controller_reset(device); + drbd_rs_controller_reset(peer_device); /* ns.conn may already be != device->state.conn, * we may have been paused in between, or become paused until * the timer triggers. @@ -1909,8 +1915,9 @@ out: mutex_unlock(device->state_mutex); } -static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +static void update_on_disk_bitmap(struct drbd_peer_device *peer_device, bool resync_done) { + struct drbd_device *device = peer_device->device; struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; device->rs_last_bcast = jiffies; @@ -1919,7 +1926,7 @@ static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) drbd_bm_write_lazy(device, 0); if (resync_done && is_sync_state(device->state.conn)) - drbd_resync_finished(device); + drbd_resync_finished(peer_device); drbd_bcast_event(device, &sib); /* update timestamp, in case it took a while to write out stuff */ @@ -1945,6 +1952,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) static void go_diskless(struct drbd_device *device) { + struct drbd_peer_device *peer_device = first_peer_device(device); D_ASSERT(device, device->state.disk == D_FAILED); /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will * inc/dec it frequently. Once we are D_DISKLESS, no one will touch @@ -1970,7 +1978,7 @@ static void go_diskless(struct drbd_device *device) * Any modifications would not be expected anymore, though. 
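 *
 * Note: as the error path below suggests, a failed final bitmap write
 * after a read error sets MDF_FULL_SYNC in the meta data, so the next
 * attach falls back to a full resync instead of trusting a stale
 * on-disk bitmap.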
*/ if (drbd_bitmap_io_from_worker(device, drbd_bm_write, - "detach", BM_LOCKED_TEST_ALLOWED)) { + "detach", BM_LOCKED_TEST_ALLOWED, peer_device)) { if (test_bit(WAS_READ_ERROR, &device->flags)) { drbd_md_set_flag(device, MDF_FULL_SYNC); drbd_md_sync(device); @@ -2017,7 +2025,7 @@ static void do_device_work(struct drbd_device *device, const unsigned long todo) do_md_sync(device); if (test_bit(RS_DONE, &todo) || test_bit(RS_PROGRESS, &todo)) - update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + update_on_disk_bitmap(first_peer_device(device), test_bit(RS_DONE, &todo)); if (test_bit(GO_DISKLESS, &todo)) go_diskless(device); if (test_bit(DESTROY_DISK, &todo)) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 487840e3564d..cec2c20f5e59 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3108,7 +3108,7 @@ loop: ptr->resultcode = 0; if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { - if (ptr->length <= 0 || ptr->length >= MAX_LEN) + if (ptr->length <= 0 || ptr->length > MAX_LEN) return -EINVAL; ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length); fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 28eb59fd71ca..bc31bb7072a2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1010,9 +1010,6 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); - /* suppress uevents while reconfiguring the device */ - dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); - /* * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. @@ -1067,6 +1064,9 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, } } + /* suppress uevents while reconfiguring the device */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1); + disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); @@ -1109,17 +1109,17 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); + /* enable and uncork uevent now that we are done */ + dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); + loop_global_unlock(lo, is_loop); if (partscan) loop_reread_partitions(lo); + if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); - error = 0; -done: - /* enable and uncork uevent now that we are done */ - dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0); - return error; + return 0; out_unlock: loop_global_unlock(lo, is_loop); @@ -1130,7 +1130,7 @@ out_putf: fput(file); /* This is safe: open() is still holding a reference. 
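 * Note: this put pairs with the __module_get(THIS_MODULE) taken at the
 * top of loop_configure(); the reference held by open() keeps the
 * module alive until the file is closed, so dropping the
 * configure-time reference on this error path is safe.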
*/ module_put(THIS_MODULE); - goto done; + return error; } static void __loop_clr_fd(struct loop_device *lo, bool release) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 592cfa8b765a..65ecde3e2a5b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -325,6 +325,9 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (blk_validate_block_size(blksize)) return -EINVAL; + if (bytesize < 0) + return -EINVAL; + nbd->config->bytesize = bytesize; nbd->config->blksize_bits = __ffs(blksize); @@ -606,7 +609,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) request.len = htonl(size); } handle = nbd_cmd_handle(cmd); - memcpy(request.handle, &handle, sizeof(handle)); + request.cookie = cpu_to_be64(handle); trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); @@ -618,7 +621,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) trace_nbd_header_sent(req, handle); if (result < 0) { if (was_interrupted(result)) { - /* If we havne't sent anything we can just return BUSY, + /* If we haven't sent anything we can just return BUSY, * however if we have sent something we need to make * sure we only allow this req to be sent until we are * completely done. @@ -732,7 +735,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, u32 tag; int ret = 0; - memcpy(&handle, reply->handle, sizeof(handle)); + handle = be64_to_cpu(reply->cookie); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) @@ -1111,6 +1114,9 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, struct nbd_sock *nsock; int err; + /* Arg will be cast to int, check it to avoid overflow */ + if (arg > INT_MAX) + return -EINVAL; sock = nbd_get_socket(nbd, arg, &err); if (!sock) return err; @@ -1660,7 +1666,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) return -EIO; dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); - if (!dir) { + if (IS_ERR(dir)) { dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n", nbd_name(nbd)); return -EIO; @@ -1686,7 +1692,7 @@ static int nbd_dbg_init(void) struct dentry *dbg_dir; dbg_dir = debugfs_create_dir("nbd", NULL); - if (!dbg_dir) + if (IS_ERR(dbg_dir)) return -EIO; nbd_dbg_dir = dbg_dir; @@ -1799,7 +1805,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) * Tell the block layer that we are not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); @@ -1934,11 +1939,11 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } } - if (!info->attrs[NBD_ATTR_SOCKETS]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SOCKETS)) { pr_err("must specify at least one socket\n"); return -EINVAL; } - if (!info->attrs[NBD_ATTR_SIZE_BYTES]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_SIZE_BYTES)) { pr_err("must specify a size in bytes for the device\n"); return -EINVAL; } @@ -2123,7 +2128,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (!info->attrs[NBD_ATTR_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify an index to disconnect\n"); return -EINVAL; } @@ -2161,7 +2166,7 @@ static int 
nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) if (!netlink_capable(skb, CAP_SYS_ADMIN)) return -EPERM; - if (!info->attrs[NBD_ATTR_INDEX]) { + if (GENL_REQ_ATTR_CHECK(info, NBD_ATTR_INDEX)) { pr_err("must specify a device to reconfigure\n"); return -EINVAL; } @@ -2325,6 +2330,7 @@ static struct genl_family nbd_genl_family __ro_after_init = { .n_small_ops = ARRAY_SIZE(nbd_connect_genl_ops), .resv_start_op = NBD_CMD_STATUS + 1, .maxattr = NBD_ATTR_MAX, + .netnsok = 1, .policy = nbd_attr_policy, .mcgrps = nbd_mcast_grps, .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps), diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig index 6bf1f8ca20a2..ff23bb9346d0 100644 --- a/drivers/block/null_blk/Kconfig +++ b/drivers/block/null_blk/Kconfig @@ -9,4 +9,4 @@ config BLK_DEV_NULL_BLK config BLK_DEV_NULL_BLK_FAULT_INJECTION bool "Support fault injection for Null test block driver" - depends on BLK_DEV_NULL_BLK && FAULT_INJECTION + depends on BLK_DEV_NULL_BLK && FAULT_INJECTION_CONFIGFS diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 9e6b032c8ecc..b3fedafe301e 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -250,7 +250,7 @@ static void null_free_device_storage(struct nullb_device *dev, bool is_cache); static inline struct nullb_device *to_nullb_device(struct config_item *item) { - return item ? container_of(item, struct nullb_device, item) : NULL; + return item ? container_of(to_config_group(item), struct nullb_device, group) : NULL; } static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page) @@ -593,8 +593,29 @@ static const struct config_item_type nullb_device_type = { .ct_owner = THIS_MODULE, }; +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + +static void nullb_add_fault_config(struct nullb_device *dev) +{ + fault_config_init(&dev->timeout_config, "timeout_inject"); + fault_config_init(&dev->requeue_config, "requeue_inject"); + fault_config_init(&dev->init_hctx_fault_config, "init_hctx_fault_inject"); + + configfs_add_default_group(&dev->timeout_config.group, &dev->group); + configfs_add_default_group(&dev->requeue_config.group, &dev->group); + configfs_add_default_group(&dev->init_hctx_fault_config.group, &dev->group); +} + +#else + +static void nullb_add_fault_config(struct nullb_device *dev) +{ +} + +#endif + static struct -config_item *nullb_group_make_item(struct config_group *group, const char *name) +config_group *nullb_group_make_group(struct config_group *group, const char *name) { struct nullb_device *dev; @@ -605,9 +626,10 @@ config_item *nullb_group_make_item(struct config_group *group, const char *name) if (!dev) return ERR_PTR(-ENOMEM); - config_item_init_type_name(&dev->item, name, &nullb_device_type); + config_group_init_type_name(&dev->group, name, &nullb_device_type); + nullb_add_fault_config(dev); - return &dev->item; + return &dev->group; } static void @@ -645,7 +667,7 @@ static struct configfs_attribute *nullb_group_attrs[] = { }; static struct configfs_group_operations nullb_group_ops = { - .make_item = nullb_group_make_item, + .make_group = nullb_group_make_group, .drop_item = nullb_group_drop_item, }; @@ -676,6 +698,13 @@ static struct nullb_device *null_alloc_dev(void) dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; + +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + dev->timeout_config.attr = null_timeout_attr; + dev->requeue_config.attr = null_requeue_attr; + dev->init_hctx_fault_config.attr = null_init_hctx_attr; +#endif + 
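/*
 * Note: null_timeout_attr, null_requeue_attr and null_init_hctx_attr
 * are, as far as this patch shows, the module-wide fault_attr
 * instances that null_setup_fault() fills from the g_timeout_str,
 * g_requeue_str and g_init_hctx_str parameters; copying them here
 * seeds each new device with those defaults, which the per-device
 * timeout_inject/requeue_inject/init_hctx_fault_inject configfs
 * groups added above can then override.
 */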
INIT_RADIX_TREE(&dev->data, GFP_ATOMIC); INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC); if (badblocks_init(&dev->badblocks, 0)) { @@ -1030,8 +1059,8 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) if (!t_page) return -ENOMEM; - src = kmap_atomic(c_page->page); - dst = kmap_atomic(t_page->page); + src = kmap_local_page(c_page->page); + dst = kmap_local_page(t_page->page); for (i = 0; i < PAGE_SECTORS; i += (nullb->dev->blocksize >> SECTOR_SHIFT)) { @@ -1043,8 +1072,8 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) } } - kunmap_atomic(dst); - kunmap_atomic(src); + kunmap_local(dst); + kunmap_local(src); ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page); null_free_page(ret); @@ -1112,7 +1141,6 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; - void *dst, *src; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); @@ -1126,11 +1154,7 @@ static int copy_to_nullb(struct nullb *nullb, struct page *source, if (!t_page) return -ENOSPC; - src = kmap_atomic(source); - dst = kmap_atomic(t_page->page); - memcpy(dst + offset, src + off + count, temp); - kunmap_atomic(dst); - kunmap_atomic(src); + memcpy_page(t_page->page, offset, source, off + count, temp); __set_bit(sector & SECTOR_MASK, t_page->bitmap); @@ -1149,7 +1173,6 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, size_t temp, count = 0; unsigned int offset; struct nullb_page *t_page; - void *dst, *src; while (count < n) { temp = min_t(size_t, nullb->dev->blocksize, n - count); @@ -1158,16 +1181,11 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest, t_page = null_lookup_page(nullb, sector, false, !null_cache_active(nullb)); - dst = kmap_atomic(dest); - if (!t_page) { - memset(dst + off + count, 0, temp); - goto next; - } - src = kmap_atomic(t_page->page); - memcpy(dst + off + count, src + offset, temp); - kunmap_atomic(src); -next: - kunmap_atomic(dst); + if (t_page) + memcpy_page(dest, off + count, t_page->page, offset, + temp); + else + zero_user(dest, off + count, temp); count += temp; sector += temp >> SECTOR_SHIFT; @@ -1178,11 +1196,7 @@ next: static void nullb_fill_pattern(struct nullb *nullb, struct page *page, unsigned int len, unsigned int off) { - void *dst; - - dst = kmap_atomic(page); - memset(dst + off, 0xFF, len); - kunmap_atomic(dst); + memset_page(page, off, 0xff, len); } blk_status_t null_handle_discard(struct nullb_device *dev, @@ -1529,24 +1543,48 @@ static void null_submit_bio(struct bio *bio) null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); } +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + +static bool should_timeout_request(struct request *rq) +{ + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_device *dev = cmd->nq->dev; + + return should_fail(&dev->timeout_config.attr, 1); +} + +static bool should_requeue_request(struct request *rq) +{ + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_device *dev = cmd->nq->dev; + + return should_fail(&dev->requeue_config.attr, 1); +} + +static bool should_init_hctx_fail(struct nullb_device *dev) +{ + return should_fail(&dev->init_hctx_fault_config.attr, 1); +} + +#else + static bool should_timeout_request(struct request *rq) { -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_timeout_str[0]) - return should_fail(&null_timeout_attr, 1); -#endif return false; } static bool should_requeue_request(struct 
request *rq) { -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_requeue_str[0]) - return should_fail(&null_requeue_attr, 1); -#endif return false; } +static bool should_init_hctx_fail(struct nullb_device *dev) +{ + return false; +} + +#endif + static void null_map_queues(struct blk_mq_tag_set *set) { struct nullb *nullb = set->driver_data; @@ -1743,10 +1781,8 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, struct nullb *nullb = hctx->queue->queuedata; struct nullb_queue *nq; -#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION - if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1)) + if (should_init_hctx_fail(nullb->dev)) return -EFAULT; -#endif nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; @@ -1964,6 +2000,11 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) static int null_validate_conf(struct nullb_device *dev) { + if (dev->queue_mode == NULL_Q_RQ) { + pr_err("legacy IO path is no longer available\n"); + return -EINVAL; + } + dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); @@ -2066,9 +2107,6 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_cleanup_queues; - if (!null_setup_fault()) - goto out_cleanup_tags; - nullb->tag_set->timeout = 5 * HZ; nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); if (IS_ERR(nullb->disk)) { @@ -2106,7 +2144,6 @@ static int null_add_dev(struct nullb_device *dev) nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); @@ -2130,10 +2167,10 @@ static int null_add_dev(struct nullb_device *dev) null_config_discard(nullb); - if (config_item_name(&dev->item)) { + if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ snprintf(nullb->disk_name, sizeof(nullb->disk_name), - "%s", config_item_name(&dev->item)); + "%s", config_item_name(&dev->group.cg_item)); } else { sprintf(nullb->disk_name, "nullb%d", nullb->index); } @@ -2233,6 +2270,9 @@ static int __init null_init(void) g_home_node = NUMA_NO_NODE; } + if (!null_setup_fault()) + return -EINVAL; + if (g_queue_mode == NULL_Q_RQ) { pr_err("legacy IO path is no longer available\n"); return -EINVAL; diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index eb5972c50be8..929f659dd255 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -69,7 +69,12 @@ enum { struct nullb_device { struct nullb *nullb; - struct config_item item; + struct config_group group; +#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION + struct fault_config timeout_config; + struct fault_config requeue_config; + struct fault_config init_hctx_fault_config; +#endif struct radix_tree_root data; /* data stored in the disk */ struct radix_tree_root cache; /* disk cache data */ unsigned long flags; /* device flags */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2f1a92509271..d5d7884cedd4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -100,7 +100,8 @@ static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ static mempool_t psd_pool; static struct bio_set pkt_bio_set; -static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ +/* /sys/class/pktcdvd */ +static struct class class_pktcdvd; static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ /* 
forward declaration */ @@ -315,8 +316,8 @@ static const struct attribute_group *pkt_groups[] = { static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) { - if (class_pktcdvd) { - pd->dev = device_create_with_groups(class_pktcdvd, NULL, + if (class_is_registered(&class_pktcdvd)) { + pd->dev = device_create_with_groups(&class_pktcdvd, NULL, MKDEV(0, 0), pd, pkt_groups, "%s", pd->name); if (IS_ERR(pd->dev)) @@ -326,7 +327,7 @@ static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) { - if (class_pktcdvd) + if (class_is_registered(&class_pktcdvd)) device_unregister(pd->dev); } @@ -338,12 +339,7 @@ static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) device_map show mappings *******************************************************************/ -static void class_pktcdvd_release(struct class *cls) -{ - kfree(cls); -} - -static ssize_t device_map_show(struct class *c, struct class_attribute *attr, +static ssize_t device_map_show(const struct class *c, const struct class_attribute *attr, char *data) { int n = 0; @@ -364,7 +360,7 @@ static ssize_t device_map_show(struct class *c, struct class_attribute *attr, } static CLASS_ATTR_RO(device_map); -static ssize_t add_store(struct class *c, struct class_attribute *attr, +static ssize_t add_store(const struct class *c, const struct class_attribute *attr, const char *buf, size_t count) { unsigned int major, minor; @@ -385,7 +381,7 @@ static ssize_t add_store(struct class *c, struct class_attribute *attr, } static CLASS_ATTR_WO(add); -static ssize_t remove_store(struct class *c, struct class_attribute *attr, +static ssize_t remove_store(const struct class *c, const struct class_attribute *attr, const char *buf, size_t count) { unsigned int major, minor; @@ -405,36 +401,23 @@ static struct attribute *class_pktcdvd_attrs[] = { }; ATTRIBUTE_GROUPS(class_pktcdvd); +static struct class class_pktcdvd = { + .name = DRIVER_NAME, + .class_groups = class_pktcdvd_groups, +}; + static int pkt_sysfs_init(void) { - int ret = 0; - /* * create control files in sysfs * /sys/class/pktcdvd/... 
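 *
 * Note: with class_pktcdvd now defined statically above, setup reduces
 * to class_register(&class_pktcdvd); there is no allocation to undo on
 * failure, and teardown becomes a plain class_unregister() instead of
 * class_destroy() on a dynamically allocated pointer.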
*/ - class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL); - if (!class_pktcdvd) - return -ENOMEM; - class_pktcdvd->name = DRIVER_NAME; - class_pktcdvd->owner = THIS_MODULE; - class_pktcdvd->class_release = class_pktcdvd_release; - class_pktcdvd->class_groups = class_pktcdvd_groups; - ret = class_register(class_pktcdvd); - if (ret) { - kfree(class_pktcdvd); - class_pktcdvd = NULL; - pr_err("failed to create class pktcdvd\n"); - return ret; - } - return 0; + return class_register(&class_pktcdvd); } static void pkt_sysfs_cleanup(void) { - if (class_pktcdvd) - class_destroy(class_pktcdvd); - class_pktcdvd = NULL; + class_unregister(&class_pktcdvd); } /******************************************************************** @@ -1869,12 +1852,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) /* * enable/disable write caching on drive */ -static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, - int set) +static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) { struct packet_command cgc; struct scsi_sense_hdr sshdr; unsigned char buf[64]; + bool set = IS_ENABLED(CONFIG_CDROM_PKTCDVD_WCACHE); int ret; init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); @@ -1890,7 +1873,12 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, if (ret) return ret; - buf[pd->mode_offset + 10] |= (!!set << 2); + /* + * use drive write caching -- we need deferred error handling to be + * able to successfully recover with this option (drive will return good + * status as soon as the cdb is validated). + */ + buf[pd->mode_offset + 10] |= (set << 2); cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); @@ -2085,7 +2073,7 @@ static int pkt_open_write(struct pktcdvd_device *pd) return -EIO; } - pkt_write_caching(pd, USE_WCACHING); + pkt_write_caching(pd); ret = pkt_get_max_speed(pd, &write_speed); if (ret) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5cb008b9700a..84ad3b17956f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -491,12 +491,12 @@ static bool single_major = true; module_param(single_major, bool, 0444); MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); -static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count); -static ssize_t remove_store(struct bus_type *bus, const char *buf, +static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count); +static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count); -static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf, size_t count); -static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf, size_t count); static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); @@ -538,7 +538,7 @@ static bool rbd_is_lock_owner(struct rbd_device *rbd_dev) return is_lock_owner; } -static ssize_t supported_features_show(struct bus_type *bus, char *buf) +static ssize_t supported_features_show(const struct bus_type *bus, char *buf) { return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED); } @@ -6967,9 +6967,7 @@ err_out_format: return ret; } -static ssize_t do_rbd_add(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_add(const char *buf, size_t 
count) { struct rbd_device *rbd_dev = NULL; struct ceph_options *ceph_opts = NULL; @@ -7081,18 +7079,18 @@ err_out_args: goto out; } -static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t add_store(const struct bus_type *bus, const char *buf, size_t count) { if (single_major) return -EINVAL; - return do_rbd_add(bus, buf, count); + return do_rbd_add(buf, count); } -static ssize_t add_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t add_single_major_store(const struct bus_type *bus, const char *buf, size_t count) { - return do_rbd_add(bus, buf, count); + return do_rbd_add(buf, count); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) @@ -7122,9 +7120,7 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) } } -static ssize_t do_rbd_remove(struct bus_type *bus, - const char *buf, - size_t count) +static ssize_t do_rbd_remove(const char *buf, size_t count) { struct rbd_device *rbd_dev = NULL; struct list_head *tmp; @@ -7196,18 +7192,18 @@ static ssize_t do_rbd_remove(struct bus_type *bus, return count; } -static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count) +static ssize_t remove_store(const struct bus_type *bus, const char *buf, size_t count) { if (single_major) return -EINVAL; - return do_rbd_remove(bus, buf, count); + return do_rbd_remove(buf, count); } -static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf, +static ssize_t remove_single_major_store(const struct bus_type *bus, const char *buf, size_t count) { - return do_rbd_remove(bus, buf, count); + return do_rbd_remove(buf, count); } /* diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index e7c7d9a68168..8c6087949794 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -646,7 +646,7 @@ int rnbd_clt_create_sysfs_files(void) { int err; - rnbd_dev_class = class_create(THIS_MODULE, "rnbd-client"); + rnbd_dev_class = class_create("rnbd-client"); if (IS_ERR(rnbd_dev_class)) return PTR_ERR(rnbd_dev_class); diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index ea7ac8bca63c..da1d0542d7e2 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -241,7 +241,7 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) bio_opf = REQ_OP_WRITE; break; case RNBD_OP_FLUSH: - bio_opf = REQ_OP_FLUSH | REQ_PREFLUSH; + bio_opf = REQ_OP_WRITE | REQ_PREFLUSH; break; case RNBD_OP_DISCARD: bio_opf = REQ_OP_DISCARD; diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 297a6924ff4e..d5d9267e1fa5 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -215,7 +215,7 @@ int rnbd_srv_create_sysfs_files(void) { int err; - rnbd_dev_class = class_create(THIS_MODULE, "rnbd-server"); + rnbd_dev_class = class_create("rnbd-server"); if (IS_ERR(rnbd_dev_class)) return PTR_ERR(rnbd_dev_class); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c73cc57ec547..c7ed5d69e9ee 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -53,7 +53,8 @@ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ - | UBLK_F_UNPRIVILEGED_DEV) + | UBLK_F_UNPRIVILEGED_DEV \ + | UBLK_F_CMD_IOCTL_ENCODE) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -128,6 +129,7 @@ struct ublk_queue { unsigned long io_addr; /* mapped vm 
address */ unsigned int max_io_sz; bool force_abort; + bool timeout; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; struct ublk_io ios[]; @@ -246,7 +248,7 @@ static int ublk_validate_params(const struct ublk_device *ub) if (ub->params.types & UBLK_PARAM_TYPE_BASIC) { const struct ublk_param_basic *p = &ub->params.basic; - if (p->logical_bs_shift > PAGE_SHIFT) + if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9) return -EINVAL; if (p->logical_bs_shift > p->physical_bs_shift) @@ -298,9 +300,7 @@ static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { - if (ubq->flags & UBLK_F_NEED_GET_DATA) - return true; - return false; + return ubq->flags & UBLK_F_NEED_GET_DATA; } static struct ublk_device *ublk_get_device(struct ublk_device *ub) @@ -349,25 +349,19 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) static inline bool ublk_queue_can_use_recovery_reissue( struct ublk_queue *ubq) { - if ((ubq->flags & UBLK_F_USER_RECOVERY) && - (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE)) - return true; - return false; + return (ubq->flags & UBLK_F_USER_RECOVERY) && + (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE); } static inline bool ublk_queue_can_use_recovery( struct ublk_queue *ubq) { - if (ubq->flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return ubq->flags & UBLK_F_USER_RECOVERY; } static inline bool ublk_can_use_recovery(struct ublk_device *ub) { - if (ub->dev_info.flags & UBLK_F_USER_RECOVERY) - return true; - return false; + return ub->dev_info.flags & UBLK_F_USER_RECOVERY; } static void ublk_free_disk(struct gendisk *disk) @@ -428,10 +422,9 @@ static const struct block_device_operations ub_fops = { #define UBLK_MAX_PIN_PAGES 32 struct ublk_map_data { - const struct ublk_queue *ubq; const struct request *rq; - const struct ublk_io *io; - unsigned max_bytes; + unsigned long ubuf; + unsigned int len; }; struct ublk_io_iter { @@ -488,18 +481,17 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, return done; } -static inline int ublk_copy_user_pages(struct ublk_map_data *data, - bool to_vm) +static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) { const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->io->addr; + const unsigned long start_vm = data->ubuf; unsigned int done = 0; struct ublk_io_iter iter = { .pg_off = start_vm & (PAGE_SIZE - 1), .bio = data->rq->bio, .iter = data->rq->bio->bi_iter, }; - const unsigned int nr_pages = round_up(data->max_bytes + + const unsigned int nr_pages = round_up(data->len + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; while (done < nr_pages) { @@ -512,42 +504,49 @@ static inline int ublk_copy_user_pages(struct ublk_map_data *data, iter.pages); if (iter.nr_pages <= 0) return done == 0 ? 
iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + len = ublk_copy_io_pages(&iter, data->len, to_vm); for (i = 0; i < iter.nr_pages; i++) { if (to_vm) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->max_bytes -= len; + data->len -= len; done += iter.nr_pages; } return done; } +static inline bool ublk_need_map_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE; +} + +static inline bool ublk_need_unmap_req(const struct request *req) +{ + return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ; +} + static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, struct ublk_io *io) { const unsigned int rq_bytes = blk_rq_bytes(req); + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current * context is pretty fast, see ublk_pin_user_pages */ - if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) - return rq_bytes; - - if (ublk_rq_has_data(req)) { + if (ublk_need_map_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = rq_bytes, + .ubuf = io->addr, + .len = rq_bytes, }; ublk_copy_user_pages(&data, true); - return rq_bytes - data.max_bytes; + return rq_bytes - data.len; } return rq_bytes; } @@ -558,19 +557,18 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); - if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + if (ublk_need_unmap_req(req)) { struct ublk_map_data data = { - .ubq = ubq, .rq = req, - .io = io, - .max_bytes = io->res, + .ubuf = io->addr, + .len = io->res, }; WARN_ON_ONCE(io->res > rq_bytes); ublk_copy_user_pages(&data, false); - return io->res - data.max_bytes; + return io->res - data.len; } return rq_bytes; } @@ -655,14 +653,15 @@ static void ublk_complete_rq(struct request *req) struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; if (io->res < 0) { - blk_mq_end_request(req, errno_to_blk_status(io->res)); - return; + res = errno_to_blk_status(io->res); + goto exit; } /* @@ -671,10 +670,8 @@ static void ublk_complete_rq(struct request *req) * * Both the two needn't unmap. 
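 *
 * Note: only a data-carrying READ copies back into the request pages,
 * mirroring ublk_need_unmap_req() above (ublk_rq_has_data(req) &&
 * req_op(req) == REQ_OP_READ); everything else completes directly.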
*/ - if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { - blk_mq_end_request(req, BLK_STS_OK); - return; - } + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) + goto exit; /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(ubq, req, io); @@ -691,6 +688,10 @@ static void ublk_complete_rq(struct request *req) blk_mq_requeue_request(req, true); else __blk_mq_end_request(req, BLK_STS_OK); + + return; +exit: + blk_mq_end_request(req, res); } /* @@ -771,9 +772,7 @@ static inline void __ublk_rq_task_work(struct request *req, return; } - if (ublk_need_get_data(ubq) && - (req_op(req) == REQ_OP_WRITE || - req_op(req) == REQ_OP_FLUSH)) { + if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) { /* * We have not handled the UBLK_IO_NEED_GET_DATA command yet, * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv @@ -900,6 +899,22 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) } } +static enum blk_eh_timer_return ublk_timeout(struct request *rq) +{ + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { + if (!ubq->timeout) { + send_sig(SIGKILL, ubq->ubq_daemon, 0); + ubq->timeout = true; + } + + return BLK_EH_DONE; + } + + return BLK_EH_RESET_TIMER; +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -959,6 +974,7 @@ static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, .init_request = ublk_init_rq, + .timeout = ublk_timeout, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -1019,7 +1035,7 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) } static void ublk_commit_completion(struct ublk_device *ub, - struct ublksrv_io_cmd *ub_cmd) + const struct ublksrv_io_cmd *ub_cmd) { u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; struct ublk_queue *ubq = ublk_get_queue(ub, qid); @@ -1261,9 +1277,23 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, ublk_queue_cmd(ubq, req); } -static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +static inline int ublk_check_cmd_op(u32 cmd_op) +{ + u32 ioc_type = _IOC_TYPE(cmd_op); + + if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u') + return -EOPNOTSUPP; + + if (ioc_type != 'u' && ioc_type != 0) + return -EOPNOTSUPP; + + return 0; +} + +static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags, + const struct ublksrv_io_cmd *ub_cmd) { - struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; struct ublk_device *ub = cmd->file->private_data; struct ublk_queue *ubq; struct ublk_io *io; @@ -1302,10 +1332,15 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) * iff the driver has set the UBLK_IO_FLAG_NEED_GET_DATA.
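 *
 * Note: the XOR check below rejects both mismatched cases at once -- a
 * NEED_GET_DATA command while the flag is clear, and any other command
 * while the flag is set.  Comparing _IOC_NR(cmd_op) rather than the
 * raw cmd_op keeps the test valid for the legacy plain opcodes as well
 * as the new ioctl-style encoding, since both are expected to share
 * the same _IOC_NR value (e.g. the legacy UBLK_IO_FETCH_REQ constant
 * reused as the nr field of an _IOWR('u', ...) opcode).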
*/ if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) - ^ (cmd_op == UBLK_IO_NEED_GET_DATA)) + ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) + goto out; + + ret = ublk_check_cmd_op(cmd_op); + if (ret) goto out; - switch (cmd_op) { + ret = -EINVAL; + switch (_IOC_NR(cmd_op)) { case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -1362,6 +1397,23 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EIOCBQUEUED; } +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + /* + * Not necessary for async retry, but let's keep it simple and always + * copy the values to avoid any potential reuse. + */ + const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe); + const struct ublksrv_io_cmd ub_cmd = { + .q_id = READ_ONCE(ub_src->q_id), + .tag = READ_ONCE(ub_src->tag), + .result = READ_ONCE(ub_src->result), + .addr = READ_ONCE(ub_src->addr) + }; + + return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); +} + static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, @@ -1567,7 +1619,7 @@ static struct ublk_device *ublk_get_device_from_id(int idx) static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ublksrv_pid = (int)header->data[0]; struct gendisk *disk; int ret = -EINVAL; @@ -1630,7 +1682,7 @@ out_unlock: static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; cpumask_var_t cpumask; unsigned long queue; @@ -1681,7 +1733,7 @@ static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublksrv_ctrl_dev_info info; struct ublk_device *ub; @@ -1703,6 +1755,18 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) return -EPERM; + /* + * unprivileged device can't be trusted, but RECOVERY and + * RECOVERY_REISSUE still may hang error handling, so can't + * support recovery features for unprivileged ublk now + * + * TODO: provide forward progress for RECOVERY handler, so that + * unprivileged device can benefit from it + */ + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | + UBLK_F_USER_RECOVERY); + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); @@ -1752,6 +1816,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE; + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; @@ -1844,7 +1910,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const 
struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", __func__, cmd->cmd_op, header->dev_id, header->queue_id, @@ -1863,7 +1929,7 @@ static int ublk_ctrl_stop_dev(struct ublk_device *ub) static int ublk_ctrl_get_dev_info(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) @@ -1894,7 +1960,7 @@ static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) static int ublk_ctrl_get_params(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; int ret; @@ -1925,7 +1991,7 @@ static int ublk_ctrl_get_params(struct ublk_device *ub, static int ublk_ctrl_set_params(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; int ret = -EFAULT; @@ -1952,6 +2018,8 @@ static int ublk_ctrl_set_params(struct ublk_device *ub, /* clear all we don't support yet */ ub->params.types &= UBLK_PARAM_TYPE_ALL; ret = ublk_validate_params(ub); + if (ret) + ub->params.types = 0; } mutex_unlock(&ub->mutex); @@ -1969,6 +2037,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; + ubq->timeout = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i]; @@ -1983,7 +2052,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) static int ublk_ctrl_start_recovery(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ret = -EINVAL; int i; @@ -2025,7 +2094,7 @@ static int ublk_ctrl_start_recovery(struct ublk_device *ub, static int ublk_ctrl_end_recovery(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); int ublksrv_pid = (int)header->data[0]; int ret = -EINVAL; @@ -2092,7 +2161,7 @@ exit: static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, struct io_uring_cmd *cmd) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe); bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; void __user *argp = (void __user *)(unsigned long)header->addr; char *dev_path = NULL; @@ -2108,7 +2177,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, * know if the specified device is created as unprivileged * mode. 
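 *
 * Note: UBLK_CMD_GET_DEV_INFO2 carries the char device path in its
 * payload precisely so that a caller who cannot know the device's mode
 * still goes through the path-based permission check below; any other
 * command from a CAP_SYS_ADMIN caller is accepted here directly.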
*/ - if (cmd->cmd_op != UBLK_CMD_GET_DEV_INFO2) + if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2) return 0; } @@ -2134,7 +2203,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, dev_path[header->dev_path_len] = 0; ret = -EINVAL; - switch (cmd->cmd_op) { + switch (_IOC_NR(cmd->cmd_op)) { case UBLK_CMD_GET_DEV_INFO: case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: @@ -2171,8 +2240,9 @@ exit: static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); struct ublk_device *ub = NULL; + u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; if (issue_flags & IO_URING_F_NONBLOCK) @@ -2183,22 +2253,22 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!(issue_flags & IO_URING_F_SQE128)) goto out; - if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { + ret = ublk_check_cmd_op(cmd_op); + if (ret) + goto out; + + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); if (!ub) goto out; ret = ublk_ctrl_uring_cmd_permission(ub, cmd); - } else { - /* ADD_DEV permission check is done in command handler */ - ret = 0; + if (ret) + goto put_dev; } - if (ret) - goto put_dev; - - switch (cmd->cmd_op) { + switch (_IOC_NR(cmd_op)) { case UBLK_CMD_START_DEV: ret = ublk_ctrl_start_dev(ub, cmd); break; @@ -2272,7 +2342,7 @@ static int __init ublk_init(void) if (ret) goto unregister_mis; - ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); + ublk_chr_class = class_create("ublk-char"); if (IS_ERR(ublk_chr_class)) { ret = PTR_ERR(ublk_chr_class); goto free_chrdev_region; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2723eede6f21..2b918e28acaa 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -96,16 +96,14 @@ struct virtblk_req { /* * The zone append command has an extended in header. - * The status field in zone_append_in_hdr must have - * the same offset in virtblk_req as the non-zoned - * status field above. + * The status field in zone_append_in_hdr must always + * be the last byte. 
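 *
 * Note: the virtblk_vbr_status() helper below relies on this
 * invariant -- whichever in-header format the device filled in, the
 * status can be fetched as the byte at offset in_hdr_len - 1 from the
 * start of the union, without dispatching on the request type.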
*/ struct { + __virtio64 sector; u8 status; - u8 reserved[7]; - __le64 append_sector; - } zone_append_in_hdr; - }; + } zone_append; + } in_hdr; size_t in_hdr_len; @@ -154,7 +152,7 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr) sgs[num_out + num_in++] = vbr->sg_table.sgl; } - sg_init_one(&in_hdr, &vbr->status, vbr->in_hdr_len); + sg_init_one(&in_hdr, &vbr->in_hdr.status, vbr->in_hdr_len); sgs[num_out + num_in++] = &in_hdr; return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); @@ -242,11 +240,14 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, struct request *req, struct virtblk_req *vbr) { - size_t in_hdr_len = sizeof(vbr->status); + size_t in_hdr_len = sizeof(vbr->in_hdr.status); bool unmap = false; u32 type; u64 sector = 0; + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) && op_is_zone_mgmt(req_op(req))) + return BLK_STS_NOTSUPP; + /* Set fields for all request types */ vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req)); @@ -287,7 +288,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, case REQ_OP_ZONE_APPEND: type = VIRTIO_BLK_T_ZONE_APPEND; sector = blk_rq_pos(req); - in_hdr_len = sizeof(vbr->zone_append_in_hdr); + in_hdr_len = sizeof(vbr->in_hdr.zone_append); break; case REQ_OP_ZONE_RESET: type = VIRTIO_BLK_T_ZONE_RESET; @@ -297,7 +298,10 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, type = VIRTIO_BLK_T_ZONE_RESET_ALL; break; case REQ_OP_DRV_IN: - /* Out header already filled in, nothing to do */ + /* + * Out header has already been prepared by the caller (virtblk_get_id() + * or virtblk_submit_zone_report()), nothing to do here. + */ return 0; default: WARN_ON_ONCE(1); @@ -318,16 +322,28 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev, return 0; } +/* + * The status byte is always the last byte of the virtblk request + * in-header. This helper fetches its value for all in-header formats + * that are currently defined. 
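 *
 * Note: both completion paths below -- virtblk_request_done() and the
 * batched path in virtblk_handle_req() -- fetch the status through
 * this helper, so neither needs to know whether the request used the
 * plain or the zone-append in-header.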
+ */ +static inline u8 virtblk_vbr_status(struct virtblk_req *vbr) +{ + return *((u8 *)&vbr->in_hdr + vbr->in_hdr_len - 1); +} + static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); - blk_status_t status = virtblk_result(vbr->status); + blk_status_t status = virtblk_result(virtblk_vbr_status(vbr)); + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata; virtblk_unmap_data(req, vbr); virtblk_cleanup_cmd(req); if (req_op(req) == REQ_OP_ZONE_APPEND) - req->__sector = le64_to_cpu(vbr->zone_append_in_hdr.append_sector); + req->__sector = virtio64_to_cpu(vblk->vdev, + vbr->in_hdr.zone_append.sector); blk_mq_end_request(req, status); } @@ -355,7 +371,7 @@ static int virtblk_handle_req(struct virtio_blk_vq *vq, if (likely(!blk_should_fake_timeout(req->q)) && !blk_mq_complete_request_remote(req) && - !blk_mq_add_to_batch(req, iob, vbr->status, + !blk_mq_add_to_batch(req, iob, virtblk_vbr_status(vbr), virtblk_complete_batch)) virtblk_request_done(req); req_done++; @@ -550,7 +566,6 @@ static void virtio_queue_rqs(struct request **rqlist) #ifdef CONFIG_BLK_DEV_ZONED static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, unsigned int nr_zones, - unsigned int zone_sectors, size_t *buflen) { struct request_queue *q = vblk->disk->queue; @@ -558,7 +573,7 @@ static void *virtblk_alloc_report_buffer(struct virtio_blk *vblk, void *buf; nr_zones = min_t(unsigned int, nr_zones, - get_capacity(vblk->disk) >> ilog2(zone_sectors)); + get_capacity(vblk->disk) >> ilog2(vblk->zone_sectors)); bufsize = sizeof(struct virtio_blk_zone_report) + nr_zones * sizeof(struct virtio_blk_zone_descriptor); @@ -592,7 +607,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, return PTR_ERR(req); vbr = blk_mq_rq_to_pdu(req); - vbr->in_hdr_len = sizeof(vbr->status); + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT); vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector); @@ -601,7 +616,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk, goto out; blk_execute_rq(req, false); - err = blk_status_to_errno(virtblk_result(vbr->status)); + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); out: blk_mq_free_request(req); return err; @@ -609,29 +624,72 @@ out: static int virtblk_parse_zone(struct virtio_blk *vblk, struct virtio_blk_zone_descriptor *entry, - unsigned int idx, unsigned int zone_sectors, - report_zones_cb cb, void *data) + unsigned int idx, report_zones_cb cb, void *data) { struct blk_zone zone = { }; - if (entry->z_type != VIRTIO_BLK_ZT_SWR && - entry->z_type != VIRTIO_BLK_ZT_SWP && - entry->z_type != VIRTIO_BLK_ZT_CONV) { - dev_err(&vblk->vdev->dev, "invalid zone type %#x\n", - entry->z_type); - return -EINVAL; + zone.start = virtio64_to_cpu(vblk->vdev, entry->z_start); + if (zone.start + vblk->zone_sectors <= get_capacity(vblk->disk)) + zone.len = vblk->zone_sectors; + else + zone.len = get_capacity(vblk->disk) - zone.start; + zone.capacity = virtio64_to_cpu(vblk->vdev, entry->z_cap); + zone.wp = virtio64_to_cpu(vblk->vdev, entry->z_wp); + + switch (entry->z_type) { + case VIRTIO_BLK_ZT_SWR: + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + break; + case VIRTIO_BLK_ZT_SWP: + zone.type = BLK_ZONE_TYPE_SEQWRITE_PREF; + break; + case VIRTIO_BLK_ZT_CONV: + zone.type = BLK_ZONE_TYPE_CONVENTIONAL; + break; + default: + dev_err(&vblk->vdev->dev, "zone %llu: invalid type %#x\n", + zone.start, entry->z_type); + return -EIO; } - zone.type = 
entry->z_type; - zone.cond = entry->z_state; - zone.len = zone_sectors; - zone.capacity = le64_to_cpu(entry->z_cap); - zone.start = le64_to_cpu(entry->z_start); - if (zone.cond == BLK_ZONE_COND_FULL) + switch (entry->z_state) { + case VIRTIO_BLK_ZS_EMPTY: + zone.cond = BLK_ZONE_COND_EMPTY; + break; + case VIRTIO_BLK_ZS_CLOSED: + zone.cond = BLK_ZONE_COND_CLOSED; + break; + case VIRTIO_BLK_ZS_FULL: + zone.cond = BLK_ZONE_COND_FULL; zone.wp = zone.start + zone.len; - else - zone.wp = le64_to_cpu(entry->z_wp); + break; + case VIRTIO_BLK_ZS_EOPEN: + zone.cond = BLK_ZONE_COND_EXP_OPEN; + break; + case VIRTIO_BLK_ZS_IOPEN: + zone.cond = BLK_ZONE_COND_IMP_OPEN; + break; + case VIRTIO_BLK_ZS_NOT_WP: + zone.cond = BLK_ZONE_COND_NOT_WP; + break; + case VIRTIO_BLK_ZS_RDONLY: + zone.cond = BLK_ZONE_COND_READONLY; + zone.wp = ULONG_MAX; + break; + case VIRTIO_BLK_ZS_OFFLINE: + zone.cond = BLK_ZONE_COND_OFFLINE; + zone.wp = ULONG_MAX; + break; + default: + dev_err(&vblk->vdev->dev, "zone %llu: invalid condition %#x\n", + zone.start, entry->z_state); + return -EIO; + } + /* + * The callback below checks the validity of the reported + * entry data, no need to further validate it here. + */ return cb(&zone, idx, data); } @@ -641,39 +699,47 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, { struct virtio_blk *vblk = disk->private_data; struct virtio_blk_zone_report *report; - unsigned int zone_sectors = vblk->zone_sectors; - unsigned int nz, i; - int ret, zone_idx = 0; + unsigned long long nz, i; size_t buflen; + unsigned int zone_idx = 0; + int ret; if (WARN_ON_ONCE(!vblk->zone_sectors)) return -EOPNOTSUPP; - report = virtblk_alloc_report_buffer(vblk, nr_zones, - zone_sectors, &buflen); + report = virtblk_alloc_report_buffer(vblk, nr_zones, &buflen); if (!report) return -ENOMEM; + mutex_lock(&vblk->vdev_mutex); + + if (!vblk->vdev) { + ret = -ENXIO; + goto fail_report; + } + while (zone_idx < nr_zones && sector < get_capacity(vblk->disk)) { memset(report, 0, buflen); ret = virtblk_submit_zone_report(vblk, (char *)report, buflen, sector); - if (ret) { - if (ret > 0) - ret = -EIO; - goto out_free; - } - nz = min((unsigned int)le64_to_cpu(report->nr_zones), nr_zones); + if (ret) + goto fail_report; + + nz = min_t(u64, virtio64_to_cpu(vblk->vdev, report->nr_zones), + nr_zones); if (!nz) break; for (i = 0; i < nz && zone_idx < nr_zones; i++) { ret = virtblk_parse_zone(vblk, &report->zones[i], - zone_idx, zone_sectors, cb, data); + zone_idx, cb, data); if (ret) - goto out_free; - sector = le64_to_cpu(report->zones[i].z_start) + zone_sectors; + goto fail_report; + + sector = virtio64_to_cpu(vblk->vdev, + report->zones[i].z_start) + + vblk->zone_sectors; zone_idx++; } } @@ -682,7 +748,8 @@ static int virtblk_report_zones(struct gendisk *disk, sector_t sector, ret = zone_idx; else ret = -EINVAL; -out_free: +fail_report: + mutex_unlock(&vblk->vdev_mutex); kvfree(report); return ret; } @@ -691,20 +758,28 @@ static void virtblk_revalidate_zones(struct virtio_blk *vblk) { u8 model; - if (!vblk->zone_sectors) - return; - virtio_cread(vblk->vdev, struct virtio_blk_config, zoned.model, &model); - if (!blk_revalidate_disk_zones(vblk->disk, NULL)) - set_capacity_and_notify(vblk->disk, 0); + switch (model) { + default: + dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model); + fallthrough; + case VIRTIO_BLK_Z_NONE: + case VIRTIO_BLK_Z_HA: + disk_set_zoned(vblk->disk, BLK_ZONED_NONE); + return; + case VIRTIO_BLK_Z_HM: + WARN_ON_ONCE(!vblk->zone_sectors); + if 
(!blk_revalidate_disk_zones(vblk->disk, NULL)) + set_capacity_and_notify(vblk->disk, 0); + } } static int virtblk_probe_zoned_device(struct virtio_device *vdev, struct virtio_blk *vblk, struct request_queue *q) { - u32 v; + u32 v, wg; u8 model; int ret; @@ -713,16 +788,11 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, switch (model) { case VIRTIO_BLK_Z_NONE: + case VIRTIO_BLK_Z_HA: + /* Present the host-aware device as non-zoned */ return 0; case VIRTIO_BLK_Z_HM: break; - case VIRTIO_BLK_Z_HA: - /* - * Present the host-aware device as a regular drive. - * TODO It is possible to add an option to make it appear - * in the system as a zoned drive. - */ - return 0; default: dev_err(&vdev->dev, "unsupported zone model %d\n", model); return -EINVAL; @@ -735,32 +805,31 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); - disk_set_max_open_zones(vblk->disk, le32_to_cpu(v)); - - dev_dbg(&vdev->dev, "max open zones = %u\n", le32_to_cpu(v)); + disk_set_max_open_zones(vblk->disk, v); + dev_dbg(&vdev->dev, "max open zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, zoned.max_active_zones, &v); - disk_set_max_active_zones(vblk->disk, le32_to_cpu(v)); - dev_dbg(&vdev->dev, "max active zones = %u\n", le32_to_cpu(v)); + disk_set_max_active_zones(vblk->disk, v); + dev_dbg(&vdev->dev, "max active zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, - zoned.write_granularity, &v); - if (!v) { + zoned.write_granularity, &wg); + if (!wg) { dev_warn(&vdev->dev, "zero write granularity reported\n"); return -ENODEV; } - blk_queue_physical_block_size(q, le32_to_cpu(v)); - blk_queue_io_min(q, le32_to_cpu(v)); + blk_queue_physical_block_size(q, wg); + blk_queue_io_min(q, wg); - dev_dbg(&vdev->dev, "write granularity = %u\n", le32_to_cpu(v)); + dev_dbg(&vdev->dev, "write granularity = %u\n", wg); /* * virtio ZBD specification doesn't require zones to be a power of * two sectors in size, but the code in this driver expects that. */ - virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, &v); - vblk->zone_sectors = le32_to_cpu(v); + virtio_cread(vdev, struct virtio_blk_config, zoned.zone_sectors, + &vblk->zone_sectors); if (vblk->zone_sectors == 0 || !is_power_of_2(vblk->zone_sectors)) { dev_err(&vdev->dev, "zoned device with non power of two zone size %u\n", @@ -783,36 +852,46 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, dev_warn(&vdev->dev, "zero max_append_sectors reported\n"); return -ENODEV; } - blk_queue_max_zone_append_sectors(q, le32_to_cpu(v)); - dev_dbg(&vdev->dev, "max append sectors = %u\n", le32_to_cpu(v)); + if ((v << SECTOR_SHIFT) < wg) { + dev_err(&vdev->dev, + "write granularity %u exceeds max_append_sectors %u limit\n", + wg, v); + return -ENODEV; + } + + blk_queue_max_zone_append_sectors(q, v); + dev_dbg(&vdev->dev, "max append sectors = %u\n", v); } return ret; } -static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev) -{ - return virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED); -} #else /* * Zoned block device support is not configured in this kernel. - * We only need to define a few symbols to avoid compilation errors. + * Host-managed zoned devices can't be supported, but others are + * good to go as regular block devices. 
*/ #define virtblk_report_zones NULL + static inline void virtblk_revalidate_zones(struct virtio_blk *vblk) { } + static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, struct virtio_blk *vblk, struct request_queue *q) { - return -EOPNOTSUPP; -} + u8 model; -static inline bool virtblk_has_zoned_feature(struct virtio_device *vdev) -{ - return false; + virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); + if (model == VIRTIO_BLK_Z_HM) { + dev_err(&vdev->dev, + "virtio_blk: zoned devices are not supported"); + return -EOPNOTSUPP; + } + + return 0; } #endif /* CONFIG_BLK_DEV_ZONED */ @@ -831,7 +910,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); vbr = blk_mq_rq_to_pdu(req); - vbr->in_hdr_len = sizeof(vbr->status); + vbr->in_hdr_len = sizeof(vbr->in_hdr.status); vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; @@ -840,7 +919,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) goto out; blk_execute_rq(req, false); - err = blk_status_to_errno(virtblk_result(vbr->status)); + err = blk_status_to_errno(virtblk_result(vbr->in_hdr.status)); out: blk_mq_free_request(req); return err; @@ -1498,15 +1577,16 @@ static int virtblk_probe(struct virtio_device *vdev) virtblk_update_capacity(vblk, false); virtio_device_ready(vdev); - if (virtblk_has_zoned_feature(vdev)) { + /* + * All steps that follow use the VQs therefore they need to be + * placed after the virtio_device_ready() call above. + */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { err = virtblk_probe_zoned_device(vdev, vblk, q); if (err) goto out_cleanup_disk; } - dev_info(&vdev->dev, "blk config size: %zu\n", - sizeof(struct virtio_blk_config)); - err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); if (err) goto out_cleanup_disk; @@ -1607,10 +1687,7 @@ static unsigned int features[] = { VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES, - VIRTIO_BLK_F_SECURE_ERASE, -#ifdef CONFIG_BLK_DEV_ZONED - VIRTIO_BLK_F_ZONED, -#endif /* CONFIG_BLK_DEV_ZONED */ + VIRTIO_BLK_F_SECURE_ERASE, VIRTIO_BLK_F_ZONED, }; static struct virtio_driver virtio_blk = { diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index a5cf7f1e871c..c362f4ad80ab 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -239,9 +239,9 @@ static void put_persistent_gnt(struct xen_blkif_ring *ring, atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, - unsigned int num) +static void free_persistent_gnts(struct xen_blkif_ring *ring) { + struct rb_root *root = &ring->persistent_gnts; struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; @@ -249,6 +249,9 @@ static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *ro int segs_to_unmap = 0; struct gntab_unmap_queue_data unmap_data; + if (RB_EMPTY_ROOT(root)) + return; + unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; @@ -277,9 +280,11 @@ static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *ro rb_erase(&persistent_gnt->node, root); kfree(persistent_gnt); - num--; + ring->persistent_gnt_c--; } - BUG_ON(num != 0); + + 
BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(ring->persistent_gnt_c != 0); } void xen_blkbk_unmap_purged_grants(struct work_struct *work) @@ -631,12 +636,7 @@ purge_gnt_list: void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { /* Free all persistent grant pages */ - if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) - free_persistent_gnts(ring, &ring->persistent_gnts, - ring->persistent_gnt_c); - - BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); - ring->persistent_gnt_c = 0; + free_persistent_gnts(ring); /* Since we are shutting down remove all pages from the buffer */ gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */); @@ -891,7 +891,7 @@ next: out: for (i = last_map; i < num; i++) { /* Don't zap current batch's valid persistent grants. */ - if(i >= map_until) + if (i >= map_until) pages[i]->persistent_gnt = NULL; pages[i]->handle = BLKBACK_INVALID_HANDLE; } @@ -1072,7 +1072,111 @@ static void end_block_io_op(struct bio *bio) bio_put(bio); } +static void blkif_get_x86_32_req(struct blkif_request *dst, + const struct blkif_x86_32_request *src) +{ + unsigned int i, n; + + dst->operation = READ_ONCE(src->operation); + + switch (dst->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments); + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST, + dst->u.rw.nr_segments); + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = + READ_ONCE(src->u.indirect.nr_segments); + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + n = min(MAX_INDIRECT_PAGES, + INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < n; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; + + default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. 
+ */ + dst->u.other.id = src->u.other.id; + break; + } +} + +static void blkif_get_x86_64_req(struct blkif_request *dst, + const struct blkif_x86_64_request *src) +{ + unsigned int i, n; + + dst->operation = READ_ONCE(src->operation); + + switch (dst->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = READ_ONCE(src->u.rw.nr_segments); + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + n = min_t(unsigned int, BLKIF_MAX_SEGMENTS_PER_REQUEST, + dst->u.rw.nr_segments); + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = + READ_ONCE(src->u.indirect.nr_segments); + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + n = min(MAX_INDIRECT_PAGES, + INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < n; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; + default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; + break; + } +} /* * Function to copy the from the ring buffer the 'struct blkif_request' diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index a28473470e66..40f67bfc052d 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -296,7 +296,7 @@ struct xen_blkif_ring { struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; - struct xen_blkif *blkif; + struct xen_blkif *blkif; }; struct xen_blkif { @@ -315,7 +315,7 @@ struct xen_blkif { atomic_t drain; struct work_struct free_work; - unsigned int nr_ring_pages; + unsigned int nr_ring_pages; bool multi_ref; /* All rings for this device. 
*/ struct xen_blkif_ring *rings; @@ -329,7 +329,7 @@ struct seg_buf { }; struct grant_page { - struct page *page; + struct page *page; struct persistent_gnt *persistent_gnt; grant_handle_t handle; grant_ref_t gref; @@ -384,7 +384,6 @@ void xen_blkif_xenbus_fini(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); -int xen_blkif_purge_persistent(void *arg); void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, @@ -395,100 +394,4 @@ int xen_blkbk_barrier(struct xenbus_transaction xbt, struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); void xen_blkbk_unmap_purged_grants(struct work_struct *work); -static inline void blkif_get_x86_32_req(struct blkif_request *dst, - struct blkif_x86_32_request *src) -{ - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = READ_ONCE(src->operation); - switch (dst->operation) { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - case BLKIF_OP_WRITE_BARRIER: - case BLKIF_OP_FLUSH_DISKCACHE: - dst->u.rw.nr_segments = src->u.rw.nr_segments; - dst->u.rw.handle = src->u.rw.handle; - dst->u.rw.id = src->u.rw.id; - dst->u.rw.sector_number = src->u.rw.sector_number; - barrier(); - if (n > dst->u.rw.nr_segments) - n = dst->u.rw.nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->u.rw.seg[i]; - break; - case BLKIF_OP_DISCARD: - dst->u.discard.flag = src->u.discard.flag; - dst->u.discard.id = src->u.discard.id; - dst->u.discard.sector_number = src->u.discard.sector_number; - dst->u.discard.nr_sectors = src->u.discard.nr_sectors; - break; - case BLKIF_OP_INDIRECT: - dst->u.indirect.indirect_op = src->u.indirect.indirect_op; - dst->u.indirect.nr_segments = src->u.indirect.nr_segments; - dst->u.indirect.handle = src->u.indirect.handle; - dst->u.indirect.id = src->u.indirect.id; - dst->u.indirect.sector_number = src->u.indirect.sector_number; - barrier(); - j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); - for (i = 0; i < j; i++) - dst->u.indirect.indirect_grefs[i] = - src->u.indirect.indirect_grefs[i]; - break; - default: - /* - * Don't know how to translate this op. Only get the - * ID so failure can be reported to the frontend. 
- */ - dst->u.other.id = src->u.other.id; - break; - } -} - -static inline void blkif_get_x86_64_req(struct blkif_request *dst, - struct blkif_x86_64_request *src) -{ - int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = READ_ONCE(src->operation); - switch (dst->operation) { - case BLKIF_OP_READ: - case BLKIF_OP_WRITE: - case BLKIF_OP_WRITE_BARRIER: - case BLKIF_OP_FLUSH_DISKCACHE: - dst->u.rw.nr_segments = src->u.rw.nr_segments; - dst->u.rw.handle = src->u.rw.handle; - dst->u.rw.id = src->u.rw.id; - dst->u.rw.sector_number = src->u.rw.sector_number; - barrier(); - if (n > dst->u.rw.nr_segments) - n = dst->u.rw.nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->u.rw.seg[i]; - break; - case BLKIF_OP_DISCARD: - dst->u.discard.flag = src->u.discard.flag; - dst->u.discard.id = src->u.discard.id; - dst->u.discard.sector_number = src->u.discard.sector_number; - dst->u.discard.nr_sectors = src->u.discard.nr_sectors; - break; - case BLKIF_OP_INDIRECT: - dst->u.indirect.indirect_op = src->u.indirect.indirect_op; - dst->u.indirect.nr_segments = src->u.indirect.nr_segments; - dst->u.indirect.handle = src->u.indirect.handle; - dst->u.indirect.id = src->u.indirect.id; - dst->u.indirect.sector_number = src->u.indirect.sector_number; - barrier(); - j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); - for (i = 0; i < j; i++) - dst->u.indirect.indirect_grefs[i] = - src->u.indirect.indirect_grefs[i]; - break; - default: - /* - * Don't know how to translate this op. Only get the - * ID so failure can be reported to the frontend. - */ - dst->u.other.id = src->u.other.id; - break; - } -} - #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aa490da3cef2..f6d90f1ba5cf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -54,9 +54,8 @@ static size_t huge_class_size; static const struct block_device_operations zram_devops; static void zram_free_page(struct zram *zram, size_t index); -static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio); - +static int zram_read_page(struct zram *zram, struct page *page, u32 index, + struct bio *parent); static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -148,6 +147,7 @@ static inline bool is_partial_io(struct bio_vec *bvec) { return bvec->bv_len != PAGE_SIZE; } +#define ZRAM_PARTIAL_IO 1 #else static inline bool is_partial_io(struct bio_vec *bvec) { @@ -174,36 +174,6 @@ static inline u32 zram_get_priority(struct zram *zram, u32 index) return prio & ZRAM_COMP_PRIORITY_MASK; } -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline bool valid_io_request(struct zram *zram, - sector_t start, unsigned int size) -{ - u64 end, bound; - - /* unaligned request */ - if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return false; - if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return false; - - end = start + (size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range */ - if (unlikely(start >= bound || end > bound || start > end)) - return false; - - /* I/O request is valid */ - return true; -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - *index += (*offset + bvec->bv_len) / PAGE_SIZE; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - static inline void update_used_max(struct zram *zram, const unsigned long pages) { @@ -606,41 +576,16 @@ static void free_block_bdev(struct zram *zram, unsigned long blk_idx) atomic64_dec(&zram->stats.bd_count); } -static void zram_page_end_io(struct bio *bio) -{ - struct page *page = bio_first_page_all(bio); - - page_endio(page, op_is_write(bio_op(bio)), - blk_status_to_errno(bio->bi_status)); - bio_put(bio); -} - -/* - * Returns 1 if the submission is successful. - */ -static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, +static void read_from_bdev_async(struct zram *zram, struct page *page, unsigned long entry, struct bio *parent) { struct bio *bio; - bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ, - GFP_NOIO); - if (!bio) - return -ENOMEM; - + bio = bio_alloc(zram->bdev, 1, parent->bi_opf, GFP_NOIO); bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9); - if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) { - bio_put(bio); - return -EIO; - } - - if (!parent) - bio->bi_end_io = zram_page_end_io; - else - bio_chain(bio, parent); - + __bio_add_page(bio, page, PAGE_SIZE, 0); + bio_chain(bio, parent); submit_bio(bio); - return 1; } #define PAGE_WB_SIG "page_index=" @@ -701,10 +646,6 @@ static ssize_t writeback_store(struct device *dev, } for (; nr_pages != 0; index++, nr_pages--) { - struct bio_vec bvec; - - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { spin_unlock(&zram->wb_limit_lock); @@ -748,7 +689,7 @@ static ssize_t writeback_store(struct device *dev, /* Need for hugepage writeback racing */ zram_set_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); - if (zram_bvec_read(zram, &bvec, index, 0, NULL)) { + if (zram_read_page(zram, page, index, NULL)) { zram_slot_lock(zram, index); zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_clear_flag(zram, index, ZRAM_IDLE); @@ -759,9 +700,8 @@ static ssize_t writeback_store(struct device *dev, bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); + bio_add_page(&bio, page, PAGE_SIZE, 0); - bio_add_page(&bio, bvec.bv_page, bvec.bv_len, - bvec.bv_offset); /* * XXX: A single page IO would be inefficient for write * but it would be not bad as starter. 
@@ -829,19 +769,20 @@ struct zram_work { struct work_struct work; struct zram *zram; unsigned long entry; - struct bio *bio; - struct bio_vec bvec; + struct page *page; + int error; }; -#if PAGE_SIZE != 4096 static void zram_sync_read(struct work_struct *work) { struct zram_work *zw = container_of(work, struct zram_work, work); - struct zram *zram = zw->zram; - unsigned long entry = zw->entry; - struct bio *bio = zw->bio; + struct bio_vec bv; + struct bio bio; - read_from_bdev_async(zram, &zw->bvec, entry, bio); + bio_init(&bio, zw->zram->bdev, &bv, 1, REQ_OP_READ); + bio.bi_iter.bi_sector = zw->entry * (PAGE_SIZE >> 9); + __bio_add_page(&bio, zw->page, PAGE_SIZE, 0); + zw->error = submit_bio_wait(&bio); } /* @@ -849,45 +790,39 @@ static void zram_sync_read(struct work_struct *work) * chained IO with parent IO in same context, it's a deadlock. To avoid that, * use a worker thread context. */ -static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, - unsigned long entry, struct bio *bio) +static int read_from_bdev_sync(struct zram *zram, struct page *page, + unsigned long entry) { struct zram_work work; - work.bvec = *bvec; + work.page = page; work.zram = zram; work.entry = entry; - work.bio = bio; INIT_WORK_ONSTACK(&work.work, zram_sync_read); queue_work(system_unbound_wq, &work.work); flush_work(&work.work); destroy_work_on_stack(&work.work); - return 1; -} -#else -static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, - unsigned long entry, struct bio *bio) -{ - WARN_ON(1); - return -EIO; + return work.error; } -#endif -static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, - unsigned long entry, struct bio *parent, bool sync) +static int read_from_bdev(struct zram *zram, struct page *page, + unsigned long entry, struct bio *parent) { atomic64_inc(&zram->stats.bd_reads); - if (sync) - return read_from_bdev_sync(zram, bvec, entry, parent); - else - return read_from_bdev_async(zram, bvec, entry, parent); + if (!parent) { + if (WARN_ON_ONCE(!IS_ENABLED(ZRAM_PARTIAL_IO))) + return -EIO; + return read_from_bdev_sync(zram, page, entry); + } + read_from_bdev_async(zram, page, entry, parent); + return 0; } #else static inline void reset_bdev(struct zram *zram) {}; -static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, - unsigned long entry, struct bio *parent, bool sync) +static int read_from_bdev(struct zram *zram, struct page *page, + unsigned long entry, struct bio *parent) { return -EIO; } @@ -1190,10 +1125,9 @@ static ssize_t io_stat_show(struct device *dev, down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8llu\n", + "%8llu %8llu 0 %8llu\n", (u64)atomic64_read(&zram->stats.failed_reads), (u64)atomic64_read(&zram->stats.failed_writes), - (u64)atomic64_read(&zram->stats.invalid_io), (u64)atomic64_read(&zram->stats.notify_free)); up_read(&zram->init_lock); @@ -1372,20 +1306,6 @@ out: } /* - * Reads a page from the writeback devices. Corresponding ZRAM slot - * should be unlocked. - */ -static int zram_bvec_read_from_bdev(struct zram *zram, struct page *page, - u32 index, struct bio *bio, bool partial_io) -{ - struct bio_vec bvec; - - bvec_set_page(&bvec, page, PAGE_SIZE, 0); - return read_from_bdev(zram, &bvec, zram_get_element(zram, index), bio, - partial_io); -} - -/* * Reads (decompresses if needed) a page from zspool (zsmalloc). * Corresponding ZRAM slot should be locked. 
*/ @@ -1434,8 +1354,8 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, return ret; } -static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, - struct bio *bio, bool partial_io) +static int zram_read_page(struct zram *zram, struct page *page, u32 index, + struct bio *parent) { int ret; @@ -1445,11 +1365,14 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, ret = zram_read_from_zspool(zram, page, index); zram_slot_unlock(zram, index); } else { - /* Slot should be unlocked before the function call */ + /* + * The slot should be unlocked before reading from the backing + * device. + */ zram_slot_unlock(zram, index); - ret = zram_bvec_read_from_bdev(zram, page, index, bio, - partial_io); + ret = read_from_bdev(zram, page, zram_get_element(zram, index), + parent); } /* Should NEVER happen. Return bio error if it does. */ @@ -1459,39 +1382,34 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, return ret; } -static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) +/* + * Use a temporary buffer to decompress the page, as the decompressor + * always expects a full page for the output. + */ +static int zram_bvec_read_partial(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) { + struct page *page = alloc_page(GFP_NOIO); int ret; - struct page *page; - page = bvec->bv_page; - if (is_partial_io(bvec)) { - /* Use a temporary buffer to decompress the page */ - page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); - if (!page) - return -ENOMEM; - } - - ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec)); - if (unlikely(ret)) - goto out; - - if (is_partial_io(bvec)) { - void *src = kmap_atomic(page); + if (!page) + return -ENOMEM; + ret = zram_read_page(zram, page, index, NULL); + if (likely(!ret)) + memcpy_to_bvec(bvec, page_address(page) + offset); + __free_page(page); + return ret; +} - memcpy_to_bvec(bvec, src + offset); - kunmap_atomic(src); - } -out: +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio) +{ if (is_partial_io(bvec)) - __free_page(page); - - return ret; + return zram_bvec_read_partial(zram, bvec, index, offset); + return zram_read_page(zram, bvec->bv_page, index, bio); } -static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, - u32 index, struct bio *bio) +static int zram_write_page(struct zram *zram, struct page *page, u32 index) { int ret = 0; unsigned long alloced_pages; @@ -1499,7 +1417,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, unsigned int comp_len = 0; void *src, *dst, *mem; struct zcomp_strm *zstrm; - struct page *page = bvec->bv_page; unsigned long element = 0; enum zram_pageflags flags = 0; @@ -1617,40 +1534,33 @@ out: return ret; } -static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) +/* + * This is a partial IO. Read the full page before writing the changes. + */ +static int zram_bvec_write_partial(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio) { + struct page *page = alloc_page(GFP_NOIO); int ret; - struct page *page = NULL; - struct bio_vec vec; - vec = *bvec; - if (is_partial_io(bvec)) { - void *dst; - /* - * This is a partial IO. We need to read the full page - * before to write the changes. 
- */ - page = alloc_page(GFP_NOIO|__GFP_HIGHMEM); - if (!page) - return -ENOMEM; - - ret = __zram_bvec_read(zram, page, index, bio, true); - if (ret) - goto out; - - dst = kmap_atomic(page); - memcpy_from_bvec(dst + offset, bvec); - kunmap_atomic(dst); + if (!page) + return -ENOMEM; - bvec_set_page(&vec, page, PAGE_SIZE, 0); + ret = zram_read_page(zram, page, index, bio); + if (!ret) { + memcpy_from_bvec(page_address(page) + offset, bvec); + ret = zram_write_page(zram, page, index); } + __free_page(page); + return ret; +} - ret = __zram_bvec_write(zram, &vec, index, bio); -out: +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio) +{ if (is_partial_io(bvec)) - __free_page(page); - return ret; + return zram_bvec_write_partial(zram, bvec, index, offset, bio); + return zram_write_page(zram, bvec->bv_page, index); } #ifdef CONFIG_ZRAM_MULTI_COMP @@ -1761,7 +1671,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, /* * No direct reclaim (slow path) for handle allocation and no - * re-compression attempt (unlike in __zram_bvec_write()) since + * re-compression attempt (unlike in zram_write_page()) since * we already have stored that object in zsmalloc. If we cannot * alloc memory for recompressed object then we bail out and * simply keep the old (existing) object in zsmalloc. @@ -1921,15 +1831,12 @@ release_init_lock: } #endif -/* - * zram_bio_discard - handler on discard request - * @index: physical block index in PAGE_SIZE units - * @offset: byte offset within physical block - */ -static void zram_bio_discard(struct zram *zram, u32 index, - int offset, struct bio *bio) +static void zram_bio_discard(struct zram *zram, struct bio *bio) { size_t n = bio->bi_iter.bi_size; + u32 index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + u32 offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << + SECTOR_SHIFT; /* * zram manages data in physical block size units. Because logical block @@ -1957,80 +1864,58 @@ static void zram_bio_discard(struct zram *zram, u32 index, index++; n -= PAGE_SIZE; } + + bio_endio(bio); } -/* - * Returns errno if it has some problem. Otherwise return 0 or 1. - * Returns 0 if IO request was done synchronously - * Returns 1 if IO request was successfully submitted.
- */ -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, enum req_op op, struct bio *bio) +static void zram_bio_read(struct zram *zram, struct bio *bio) { - int ret; - - if (!op_is_write(op)) { - ret = zram_bvec_read(zram, bvec, index, offset, bio); - flush_dcache_page(bvec->bv_page); - } else { - ret = zram_bvec_write(zram, bvec, index, offset, bio); - } + struct bvec_iter iter; + struct bio_vec bv; + unsigned long start_time; - zram_slot_lock(zram, index); - zram_accessed(zram, index); - zram_slot_unlock(zram, index); + start_time = bio_start_io_acct(bio); + bio_for_each_segment(bv, bio, iter) { + u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) << + SECTOR_SHIFT; - if (unlikely(ret < 0)) { - if (!op_is_write(op)) + if (zram_bvec_read(zram, &bv, index, offset, bio) < 0) { atomic64_inc(&zram->stats.failed_reads); - else - atomic64_inc(&zram->stats.failed_writes); - } + bio->bi_status = BLK_STS_IOERR; + break; + } + flush_dcache_page(bv.bv_page); - return ret; + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + } + bio_end_io_acct(bio, start_time); + bio_endio(bio); } -static void __zram_make_request(struct zram *zram, struct bio *bio) +static void zram_bio_write(struct zram *zram, struct bio *bio) { - int offset; - u32 index; - struct bio_vec bvec; struct bvec_iter iter; + struct bio_vec bv; unsigned long start_time; - index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_iter.bi_sector & - (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - switch (bio_op(bio)) { - case REQ_OP_DISCARD: - case REQ_OP_WRITE_ZEROES: - zram_bio_discard(zram, index, offset, bio); - bio_endio(bio); - return; - default: - break; - } - start_time = bio_start_io_acct(bio); - bio_for_each_segment(bvec, bio, iter) { - struct bio_vec bv = bvec; - unsigned int unwritten = bvec.bv_len; - - do { - bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset, - unwritten); - if (zram_bvec_rw(zram, &bv, index, offset, - bio_op(bio), bio) < 0) { - bio->bi_status = BLK_STS_IOERR; - break; - } + bio_for_each_segment(bv, bio, iter) { + u32 index = iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + u32 offset = (iter.bi_sector & (SECTORS_PER_PAGE - 1)) << + SECTOR_SHIFT; - bv.bv_offset += bv.bv_len; - unwritten -= bv.bv_len; + if (zram_bvec_write(zram, &bv, index, offset, bio) < 0) { + atomic64_inc(&zram->stats.failed_writes); + bio->bi_status = BLK_STS_IOERR; + break; + } - update_position(&index, &offset, &bv); - } while (unwritten); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); } bio_end_io_acct(bio, start_time); bio_endio(bio); @@ -2043,14 +1928,21 @@ static void zram_submit_bio(struct bio *bio) { struct zram *zram = bio->bi_bdev->bd_disk->private_data; - if (!valid_io_request(zram, bio->bi_iter.bi_sector, - bio->bi_iter.bi_size)) { - atomic64_inc(&zram->stats.invalid_io); - bio_io_error(bio); - return; + switch (bio_op(bio)) { + case REQ_OP_READ: + zram_bio_read(zram, bio); + break; + case REQ_OP_WRITE: + zram_bio_write(zram, bio); + break; + case REQ_OP_DISCARD: + case REQ_OP_WRITE_ZEROES: + zram_bio_discard(zram, bio); + break; + default: + WARN_ON_ONCE(1); + bio_endio(bio); } - - __zram_make_request(zram, bio); } static void zram_slot_free_notify(struct block_device *bdev, @@ -2323,7 +2215,6 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, 
zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned @@ -2424,8 +2315,8 @@ static int zram_remove(struct zram *zram) * creates a new un-initialized zram device and returns back this device's * device_id (or an error code if it fails to create a new device). */ -static ssize_t hot_add_show(struct class *class, - struct class_attribute *attr, +static ssize_t hot_add_show(const struct class *class, + const struct class_attribute *attr, char *buf) { int ret; @@ -2438,11 +2329,12 @@ static ssize_t hot_add_show(struct class *class, return ret; return scnprintf(buf, PAGE_SIZE, "%d\n", ret); } +/* This attribute must be set to 0400, so CLASS_ATTR_RO() can not be used */ static struct class_attribute class_attr_hot_add = __ATTR(hot_add, 0400, hot_add_show, NULL); -static ssize_t hot_remove_store(struct class *class, - struct class_attribute *attr, +static ssize_t hot_remove_store(const struct class *class, + const struct class_attribute *attr, const char *buf, size_t count) { @@ -2481,7 +2373,6 @@ ATTRIBUTE_GROUPS(zram_control_class); static struct class zram_control_class = { .name = "zram-control", - .owner = THIS_MODULE, .class_groups = zram_control_class_groups, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c5254626f051..ca7a15bd4845 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -78,7 +78,6 @@ struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ - atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ atomic64_t huge_pages; /* no. of huge pages */
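
Note on the ublk control-path hunks above: the dispatcher now switches on _IOC_NR(cmd->cmd_op), so a legacy plain opcode and its ioctl-style replacement fall into the same case label, while ublk_check_cmd_op() can reject anything whose extra encoding bits look wrong. A minimal userspace sketch of that relationship follows; the 'u' magic, the 0x02 value, and the stub payload struct are assumptions made for illustration, not values quoted from the ublk UAPI header.

#include <linux/ioctl.h>
#include <assert.h>
#include <stdio.h>

/* stand-in for struct ublksrv_ctrl_cmd; only its size matters here */
struct ublksrv_ctrl_cmd_stub {
	unsigned char pad[32];
};

#define UBLK_CMD_GET_DEV_INFO	0x02	/* legacy plain opcode (assumed) */
#define UBLK_U_CMD_GET_DEV_INFO \
	_IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd_stub)

int main(void)
{
	/* _IOC_NR() recovers the legacy number from the ioctl encoding */
	assert(_IOC_NR(UBLK_U_CMD_GET_DEV_INFO) == UBLK_CMD_GET_DEV_INFO);

	/* the bits beyond the number are what a driver can sanity-check */
	printf("magic='%c' nr=%#x size=%u\n",
	       _IOC_TYPE(UBLK_U_CMD_GET_DEV_INFO),
	       _IOC_NR(UBLK_U_CMD_GET_DEV_INFO),
	       _IOC_SIZE(UBLK_U_CMD_GET_DEV_INFO));
	return 0;
}

Because _IOC_NR() strips the magic, direction, and size bits, one switch statement serves both encodings; the stripped bits are what give each command a unique identity that conflict detection and auditing can rely on.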
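
The xen-blkback request-translation helpers moved out of common.h above also tighten how guest-shared fields are read: counts that size later loops are fetched once with READ_ONCE() and clamped with min()/min_t() before use, instead of the old read, barrier(), recheck sequence. Below is a compact sketch of that single-read-then-clamp discipline; it is a toy model rather than the driver's code, with READ_ONCE() approximated by a volatile access (GNU C typeof) and MAX_SEGS standing in for BLKIF_MAX_SEGMENTS_PER_REQUEST.

#include <stdint.h>

/* toy approximation of the kernel's READ_ONCE() */
#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))

#define MAX_SEGS	11	/* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

struct shared_req {		/* lives in a ring the frontend can rewrite */
	uint8_t nr_segments;
	uint64_t seg[MAX_SEGS];
};

struct local_req {		/* backend-private, trusted copy */
	uint8_t nr_segments;
	uint64_t seg[MAX_SEGS];
};

static void get_req(struct local_req *dst, const struct shared_req *src)
{
	unsigned int i, n;

	/* read the untrusted count exactly once ... */
	dst->nr_segments = READ_ONCE(src->nr_segments);
	/* ... and clamp a private copy before using it as a loop bound */
	n = dst->nr_segments < MAX_SEGS ? dst->nr_segments : MAX_SEGS;
	for (i = 0; i < n; i++)
		dst->seg[i] = src->seg[i];
}

int main(void)
{
	static struct shared_req shared = { .nr_segments = 200 };
	struct local_req local;

	get_req(&local, &shared);
	/*
	 * local.nr_segments still reports 200, so later validation can
	 * reject the request, but at most MAX_SEGS entries were copied.
	 */
	return 0;
}

Keeping the oversized count in the copy, as the real helpers do, lets the normal error path report the bogus request to the frontend instead of silently truncating it.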
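
The zram rework above funnels all sub-page I/O through zram_bvec_read_partial() and zram_bvec_write_partial(), which bounce the data through a freshly allocated page so that the core helpers only ever see whole pages. A self-contained toy version of that read-modify-write pattern follows, with an in-memory array standing in for the compressed store and every name invented for illustration.

#include <errno.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ		4096
#define NR_PAGES	64

static char store[NR_PAGES][PAGE_SZ];	/* toy backing store */

/* rough analogues of zram_read_page()/zram_write_page() */
static int backing_read(void *page, unsigned int index)
{
	memcpy(page, store[index], PAGE_SZ);
	return 0;
}

static int backing_write(const void *page, unsigned int index)
{
	memcpy(store[index], page, PAGE_SZ);
	return 0;
}

/* sub-page write: fetch the whole page, patch it, store it back whole */
static int partial_write(unsigned int index, size_t offset,
			 const void *buf, size_t len)
{
	char *page = malloc(PAGE_SZ);
	int ret;

	if (!page)
		return -ENOMEM;
	ret = backing_read(page, index);
	if (!ret) {
		memcpy(page + offset, buf, len);
		ret = backing_write(page, index);
	}
	free(page);
	return ret;
}

int main(void)
{
	/* patch five bytes in the middle of page 3 */
	return partial_write(3, 512, "hello", 5);
}

The same shape with the copy direction flipped gives the partial-read path; in both cases the temporary page, not the caller's bvec, is what reaches the common helpers.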