aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig6
-rw-r--r--arch/powerpc/platforms/pseries/Makefile1
-rw-r--r--arch/powerpc/platforms/pseries/plpks_sed_ops.c131
-rw-r--r--block/Kconfig1
-rw-r--r--block/partitions/ibm.c98
-rw-r--r--block/sed-opal.c18
-rw-r--r--drivers/block/ublk_drv.c333
-rw-r--r--drivers/md/dm-raid.c10
-rw-r--r--drivers/md/md-autodetect.c4
-rw-r--r--drivers/md/md-bitmap.c18
-rw-r--r--drivers/md/md-linear.c2
-rw-r--r--drivers/md/md.c399
-rw-r--r--drivers/md/md.h43
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid5-cache.c64
-rw-r--r--drivers/md/raid5.c56
-rw-r--r--include/linux/io_uring.h18
-rw-r--r--include/linux/io_uring_types.h10
-rw-r--r--include/linux/sed-opal-key.h26
-rw-r--r--include/uapi/linux/io_uring.h8
-rw-r--r--io_uring/Makefile3
-rw-r--r--io_uring/cancel.c5
-rw-r--r--io_uring/io_uring.c43
-rw-r--r--io_uring/io_uring.h1
-rw-r--r--io_uring/kbuf.c58
-rw-r--r--io_uring/opdef.c24
-rw-r--r--io_uring/opdef.h2
-rw-r--r--io_uring/rsrc.c37
-rw-r--r--io_uring/rw.c92
-rw-r--r--io_uring/rw.h2
-rw-r--r--io_uring/uring_cmd.c49
-rw-r--r--io_uring/waitid.c372
-rw-r--r--io_uring/waitid.h15
-rw-r--r--kernel/exit.c131
-rw-r--r--kernel/exit.h30
35 files changed, 1532 insertions, 581 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 4ebf2ef2845d..afc0f6a61337 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -164,6 +164,12 @@ config PSERIES_PLPKS
# This option is selected by in-kernel consumers that require
# access to the PKS.
+config PSERIES_PLPKS_SED
+ depends on PPC_PSERIES
+ bool
+ # This option is selected by in-kernel consumers that require
+ # access to the SED PKS keystore.
+
config PAPR_SCM
depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM
tristate "Support for the PAPR Storage Class Memory interface"
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 53c3b91af2f7..1476c5e4433c 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_PPC_SVM) += svm.o
obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
obj-$(CONFIG_PSERIES_PLPKS) += plpks.o
obj-$(CONFIG_PPC_SECURE_BOOT) += plpks-secvar.o
+obj-$(CONFIG_PSERIES_PLPKS_SED) += plpks_sed_ops.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o
diff --git a/arch/powerpc/platforms/pseries/plpks_sed_ops.c b/arch/powerpc/platforms/pseries/plpks_sed_ops.c
new file mode 100644
index 000000000000..7c873c9589ef
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/plpks_sed_ops.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Platform specific code for non-volatile SED key access
+ * Copyright (C) 2022 IBM Corporation
+ *
+ * Define operations for SED Opal to read/write keys
+ * from POWER LPAR Platform KeyStore(PLPKS).
+ *
+ * Self Encrypting Drives(SED) key storage using PLPKS
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/ioctl.h>
+#include <linux/sed-opal-key.h>
+#include <asm/plpks.h>
+
+static bool plpks_sed_initialized = false;
+static bool plpks_sed_available = false;
+
+/*
+ * structure that contains all SED data
+ */
+struct plpks_sed_object_data {
+ u_char version;
+ u_char pad1[7];
+ u_long authority;
+ u_long range;
+ u_int key_len;
+ u_char key[32];
+};
+
+#define PLPKS_SED_OBJECT_DATA_V0 0
+#define PLPKS_SED_MANGLED_LABEL "/default/pri"
+#define PLPKS_SED_COMPONENT "sed-opal"
+#define PLPKS_SED_KEY "opal-boot-pin"
+
+/*
+ * authority is admin1 and range is global
+ */
+#define PLPKS_SED_AUTHORITY 0x0000000900010001
+#define PLPKS_SED_RANGE 0x0000080200000001
+
+static void plpks_init_var(struct plpks_var *var, char *keyname)
+{
+ if (!plpks_sed_initialized) {
+ plpks_sed_initialized = true;
+ plpks_sed_available = plpks_is_available();
+ if (!plpks_sed_available)
+ pr_err("SED: plpks not available\n");
+ }
+
+ var->name = keyname;
+ var->namelen = strlen(keyname);
+ if (strcmp(PLPKS_SED_KEY, keyname) == 0) {
+ var->name = PLPKS_SED_MANGLED_LABEL;
+ var->namelen = strlen(keyname);
+ }
+ var->policy = PLPKS_WORLDREADABLE;
+ var->os = PLPKS_VAR_COMMON;
+ var->data = NULL;
+ var->datalen = 0;
+ var->component = PLPKS_SED_COMPONENT;
+}
+
+/*
+ * Read the SED Opal key from PLPKS given the label
+ */
+int sed_read_key(char *keyname, char *key, u_int *keylen)
+{
+ struct plpks_var var;
+ struct plpks_sed_object_data data;
+ int ret;
+ u_int len;
+
+ plpks_init_var(&var, keyname);
+
+ if (!plpks_sed_available)
+ return -EOPNOTSUPP;
+
+ var.data = (u8 *)&data;
+ var.datalen = sizeof(data);
+
+ ret = plpks_read_os_var(&var);
+ if (ret != 0)
+ return ret;
+
+ len = min_t(u16, be32_to_cpu(data.key_len), var.datalen);
+ memcpy(key, data.key, len);
+ key[len] = '\0';
+ *keylen = len;
+
+ return 0;
+}
+
+/*
+ * Write the SED Opal key to PLPKS given the label
+ */
+int sed_write_key(char *keyname, char *key, u_int keylen)
+{
+ struct plpks_var var;
+ struct plpks_sed_object_data data;
+ struct plpks_var_name vname;
+
+ plpks_init_var(&var, keyname);
+
+ if (!plpks_sed_available)
+ return -EOPNOTSUPP;
+
+ var.datalen = sizeof(struct plpks_sed_object_data);
+ var.data = (u8 *)&data;
+
+ /* initialize SED object */
+ data.version = PLPKS_SED_OBJECT_DATA_V0;
+ data.authority = cpu_to_be64(PLPKS_SED_AUTHORITY);
+ data.range = cpu_to_be64(PLPKS_SED_RANGE);
+ memset(&data.pad1, '\0', sizeof(data.pad1));
+ data.key_len = cpu_to_be32(keylen);
+ memcpy(data.key, (char *)key, keylen);
+
+ /*
+ * Key update requires remove first. The return value
+ * is ignored since it's okay if the key doesn't exist.
+ */
+ vname.namelen = var.namelen;
+ vname.name = var.name;
+ plpks_remove_var(var.component, var.os, vname);
+
+ return plpks_write_var(var);
+}
diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0d93..55ae2286a4de 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,7 @@ config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
select PSERIES_PLPKS if PPC_PSERIES
+ select PSERIES_PLPKS_SED if PPC_PSERIES
help
Builds Logic for interfacing with Opal enabled controllers.
Enabling this option enables users to setup/unlock/lock
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 403756dbd50d..82d9c4c3fb41 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
ptr->b;
}
+/* Volume Label Type/ID Length */
+#define DASD_VOL_TYPE_LEN 4
+#define DASD_VOL_ID_LEN 6
+
+/* Volume Label Types */
+#define DASD_VOLLBL_TYPE_VOL1 0
+#define DASD_VOLLBL_TYPE_LNX1 1
+#define DASD_VOLLBL_TYPE_CMS1 2
+
+struct dasd_vollabel {
+ char *type;
+ int idx;
+};
+
+static struct dasd_vollabel dasd_vollabels[] = {
+ [DASD_VOLLBL_TYPE_VOL1] = {
+ .type = "VOL1",
+ .idx = DASD_VOLLBL_TYPE_VOL1,
+ },
+ [DASD_VOLLBL_TYPE_LNX1] = {
+ .type = "LNX1",
+ .idx = DASD_VOLLBL_TYPE_LNX1,
+ },
+ [DASD_VOLLBL_TYPE_CMS1] = {
+ .type = "CMS1",
+ .idx = DASD_VOLLBL_TYPE_CMS1,
+ },
+};
+
+static int get_label_by_type(const char *type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) {
+ if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN))
+ return dasd_vollabels[i].idx;
+ }
+
+ return -1;
+}
+
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
@@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state,
char type[],
union label_t *label)
{
- Sector sect;
- unsigned char *data;
sector_t testsect[3];
- unsigned char temp[5];
- int found = 0;
int i, testcount;
+ Sector sect;
+ void *data;
/* There a three places where we may find a valid label:
* - on an ECKD disk it's block 2
@@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state,
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
- memcpy(temp, data, 4);
- temp[4] = 0;
- EBCASC(temp, 4);
+ memcpy(type, data, DASD_VOL_TYPE_LEN);
+ EBCASC(type, DASD_VOL_TYPE_LEN);
put_dev_sector(sect);
- if (!strcmp(temp, "VOL1") ||
- !strcmp(temp, "LNX1") ||
- !strcmp(temp, "CMS1")) {
- if (!strcmp(temp, "VOL1")) {
- strncpy(type, label->vol.vollbl, 4);
- strncpy(name, label->vol.volid, 6);
- } else {
- strncpy(type, label->lnx.vollbl, 4);
- strncpy(name, label->lnx.volid, 6);
- }
- EBCASC(type, 4);
- EBCASC(name, 6);
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
+ memcpy(name, label->vol.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
+ *labelsect = testsect[i];
+ return 1;
+ case DASD_VOLLBL_TYPE_LNX1:
+ case DASD_VOLLBL_TYPE_CMS1:
+ memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
*labelsect = testsect[i];
- found = 1;
+ return 1;
+ default:
break;
}
}
- if (!found)
- memset(label, 0, sizeof(*label));
- return found;
+ return 0;
}
static int find_vol1_partitions(struct parsed_partitions *state,
@@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state)
sector_t nr_sectors;
dasd_information2_t *info;
struct hd_geometry *geo;
- char type[5] = {0,};
- char name[7] = {0,};
+ char type[DASD_VOL_TYPE_LEN + 1] = "";
+ char name[DASD_VOL_ID_LEN + 1] = "";
sector_t labelsect;
union label_t *label;
@@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state)
info = NULL;
}
- if (find_label(state, info, geo, blocksize, &labelsect, name, type,
- label)) {
- if (!strncmp(type, "VOL1", 4)) {
+ if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) {
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
res = find_vol1_partitions(state, geo, blocksize, name,
label);
- } else if (!strncmp(type, "LNX1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_LNX1:
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, nr_sectors,
info);
- } else if (!strncmp(type, "CMS1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_CMS1:
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
+ break;
}
} else if (info) {
/*
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 6d7f25d1711b..fa23a6a60485 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>
#include <linux/sed-opal.h>
+#include <linux/sed-opal-key.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
#include <linux/key.h>
@@ -3019,7 +3020,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
if (ret)
return ret;
- /* update keyring with new password */
+ /* update keyring and key store with new password */
+ ret = sed_write_key(OPAL_AUTH_KEY,
+ opal_pw->new_user_pw.opal_key.key,
+ opal_pw->new_user_pw.opal_key.key_len);
+ if (ret != -EOPNOTSUPP)
+ pr_warn("error updating SED key: %d\n", ret);
+
ret = update_sed_opal_key(OPAL_AUTH_KEY,
opal_pw->new_user_pw.opal_key.key,
opal_pw->new_user_pw.opal_key.key_len);
@@ -3292,6 +3299,8 @@ EXPORT_SYMBOL_GPL(sed_ioctl);
static int __init sed_opal_init(void)
{
struct key *kr;
+ char init_sed_key[OPAL_KEY_MAX];
+ int keylen = OPAL_KEY_MAX - 1;
kr = keyring_alloc(".sed_opal",
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
@@ -3304,6 +3313,11 @@ static int __init sed_opal_init(void)
sed_opal_keyring = kr;
- return 0;
+ if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) {
+ memset(init_sed_key, '\0', sizeof(init_sed_key));
+ keylen = OPAL_KEY_MAX - 1;
+ }
+
+ return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen);
}
late_initcall(sed_opal_init);
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 630ddfe6657b..c34474451908 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -75,6 +75,7 @@ struct ublk_rq_data {
struct ublk_uring_cmd_pdu {
struct ublk_queue *ubq;
+ u16 tag;
};
/*
@@ -115,6 +116,9 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/* atomic RW with ubq->cancel_lock */
+#define UBLK_IO_FLAG_CANCELED 0x80000000
+
struct ublk_io {
/* userspace buffer address from io cmd */
__u64 addr;
@@ -138,13 +142,13 @@ struct ublk_queue {
unsigned int max_io_sz;
bool force_abort;
bool timeout;
+ bool canceling;
unsigned short nr_io_ready; /* how many ios setup */
+ spinlock_t cancel_lock;
struct ublk_device *dev;
struct ublk_io ios[];
};
-#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ)
-
struct ublk_device {
struct gendisk *ub_disk;
@@ -166,7 +170,7 @@ struct ublk_device {
struct mutex mutex;
- spinlock_t mm_lock;
+ spinlock_t lock;
struct mm_struct *mm;
struct ublk_params params;
@@ -175,11 +179,6 @@ struct ublk_device {
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
- /*
- * Our ubq->daemon may be killed without any notification, so
- * monitor each queue's daemon periodically
- */
- struct delayed_work monitor_work;
struct work_struct quiesce_work;
struct work_struct stop_work;
};
@@ -190,10 +189,11 @@ struct ublk_params_header {
__u32 types;
};
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq);
+
static inline unsigned int ublk_req_build_flags(struct request *req);
static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
int tag);
-
static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_USER_COPY;
@@ -470,6 +470,7 @@ static DEFINE_MUTEX(ublk_ctl_mutex);
* It can be extended to one per-user limit in future or even controlled
* by cgroup.
*/
+#define UBLK_MAX_UBLKS UBLK_MINORS
static unsigned int ublks_max = 64;
static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
@@ -1083,13 +1084,10 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
{
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
- if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
- io->flags |= UBLK_IO_FLAG_ABORTED;
- if (ublk_queue_can_use_recovery_reissue(ubq))
- blk_mq_requeue_request(req, false);
- else
- ublk_put_req_ref(ubq, req);
- }
+ if (ublk_queue_can_use_recovery_reissue(ubq))
+ blk_mq_requeue_request(req, false);
+ else
+ ublk_put_req_ref(ubq, req);
}
static void ubq_complete_io_cmd(struct ublk_io *io, int res,
@@ -1118,8 +1116,6 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_requeue_request(rq, false);
else
blk_mq_end_request(rq, BLK_STS_IOERR);
-
- mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
}
static inline void __ublk_rq_task_work(struct request *req,
@@ -1232,38 +1228,19 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
- struct ublk_io *io;
- if (!llist_add(&data->node, &ubq->io_cmds))
- return;
+ if (llist_add(&data->node, &ubq->io_cmds)) {
+ struct ublk_io *io = &ubq->ios[rq->tag];
- io = &ubq->ios[rq->tag];
- /*
- * If the check pass, we know that this is a re-issued request aborted
- * previously in monitor_work because the ubq_daemon(cmd's task) is
- * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
- * because this ioucmd's io_uring context may be freed now if no inflight
- * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
- *
- * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
- * the tag). Then the request is re-started(allocating the tag) and we are here.
- * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
- * guarantees that here is a re-issued request aborted previously.
- */
- if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
- ublk_abort_io_cmds(ubq);
- } else {
- struct io_uring_cmd *cmd = io->cmd;
- struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
-
- pdu->ubq = ubq;
- io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+ io_uring_cmd_complete_in_task(io->cmd, ublk_rq_task_work_cb);
}
}
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
{
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+ unsigned int nr_inflight = 0;
+ int i;
if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
if (!ubq->timeout) {
@@ -1274,6 +1251,29 @@ static enum blk_eh_timer_return ublk_timeout(struct request *rq)
return BLK_EH_DONE;
}
+ if (!ubq_daemon_is_dying(ubq))
+ return BLK_EH_RESET_TIMER;
+
+ for (i = 0; i < ubq->q_depth; i++) {
+ struct ublk_io *io = &ubq->ios[i];
+
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ nr_inflight++;
+ }
+
+ /* cancelable uring_cmd can't help us if all commands are in-flight */
+ if (nr_inflight == ubq->q_depth) {
+ struct ublk_device *ub = ubq->dev;
+
+ if (ublk_abort_requests(ub, ubq)) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
+ }
+ return BLK_EH_DONE;
+ }
+
return BLK_EH_RESET_TIMER;
}
@@ -1301,13 +1301,12 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
return BLK_STS_IOERR;
- blk_mq_start_request(bd->rq);
-
- if (unlikely(ubq_daemon_is_dying(ubq))) {
+ if (unlikely(ubq->canceling)) {
__ublk_abort_rq(ubq, rq);
return BLK_STS_OK;
}
+ blk_mq_start_request(bd->rq);
ublk_queue_cmd(ubq, rq);
return BLK_STS_OK;
@@ -1357,12 +1356,12 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
int q_id, ret = 0;
- spin_lock(&ub->mm_lock);
+ spin_lock(&ub->lock);
if (!ub->mm)
ub->mm = current->mm;
if (current->mm != ub->mm)
ret = -EINVAL;
- spin_unlock(&ub->mm_lock);
+ spin_unlock(&ub->lock);
if (ret)
return ret;
@@ -1411,17 +1410,14 @@ static void ublk_commit_completion(struct ublk_device *ub,
}
/*
- * When ->ubq_daemon is exiting, either new request is ended immediately,
- * or any queued io command is drained, so it is safe to abort queue
- * lockless
+ * Called from ubq_daemon context via cancel fn, meantime quiesce ublk
+ * blk-mq queue, so we are called exclusively with blk-mq and ubq_daemon
+ * context, so everything is serialized.
*/
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
- if (!ublk_get_device(ub))
- return;
-
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -1433,72 +1429,114 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
* will do it
*/
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq)
+ if (rq && blk_mq_request_started(rq)) {
+ io->flags |= UBLK_IO_FLAG_ABORTED;
__ublk_fail_req(ubq, io, rq);
+ }
}
}
- ublk_put_device(ub);
}
-static void ublk_daemon_monitor_work(struct work_struct *work)
+static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq)
{
- struct ublk_device *ub =
- container_of(work, struct ublk_device, monitor_work.work);
- int i;
+ struct gendisk *disk;
- for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
- struct ublk_queue *ubq = ublk_get_queue(ub, i);
+ spin_lock(&ubq->cancel_lock);
+ if (ubq->canceling) {
+ spin_unlock(&ubq->cancel_lock);
+ return false;
+ }
+ ubq->canceling = true;
+ spin_unlock(&ubq->cancel_lock);
- if (ubq_daemon_is_dying(ubq)) {
- if (ublk_queue_can_use_recovery(ubq))
- schedule_work(&ub->quiesce_work);
- else
- schedule_work(&ub->stop_work);
+ spin_lock(&ub->lock);
+ disk = ub->ub_disk;
+ if (disk)
+ get_device(disk_to_dev(disk));
+ spin_unlock(&ub->lock);
- /* abort queue is for making forward progress */
- ublk_abort_queue(ub, ubq);
- }
- }
+ /* Our disk has been dead */
+ if (!disk)
+ return false;
- /*
- * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
- * after ublk_remove() or __ublk_quiesce_dev() is started.
- *
- * No need ub->mutex, monitor work are canceled after state is marked
- * as not LIVE, so new state is observed reliably.
- */
- if (ub->dev_info.state == UBLK_S_DEV_LIVE)
- schedule_delayed_work(&ub->monitor_work,
- UBLK_DAEMON_MONITOR_PERIOD);
-}
+ /* Now we are serialized with ublk_queue_rq() */
+ blk_mq_quiesce_queue(disk->queue);
+ /* abort queue is for making forward progress */
+ ublk_abort_queue(ub, ubq);
+ blk_mq_unquiesce_queue(disk->queue);
+ put_device(disk_to_dev(disk));
-static inline bool ublk_queue_ready(struct ublk_queue *ubq)
-{
- return ubq->nr_io_ready == ubq->q_depth;
+ return true;
}
-static void ublk_cmd_cancel_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
+static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io,
+ unsigned int issue_flags)
{
- io_uring_cmd_done(cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
+ bool done;
+
+ if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
+ return;
+
+ spin_lock(&ubq->cancel_lock);
+ done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
+ if (!done)
+ io->flags |= UBLK_IO_FLAG_CANCELED;
+ spin_unlock(&ubq->cancel_lock);
+
+ if (!done)
+ io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
}
-static void ublk_cancel_queue(struct ublk_queue *ubq)
+/*
+ * The ublk char device won't be closed when calling cancel fn, so both
+ * ublk device and queue are guaranteed to be live
+ */
+static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
- int i;
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+ struct ublk_queue *ubq = pdu->ubq;
+ struct task_struct *task;
+ struct ublk_device *ub;
+ bool need_schedule;
+ struct ublk_io *io;
- if (!ublk_queue_ready(ubq))
+ if (WARN_ON_ONCE(!ubq))
return;
- for (i = 0; i < ubq->q_depth; i++) {
- struct ublk_io *io = &ubq->ios[i];
+ if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
+ return;
+
+ task = io_uring_cmd_get_task(cmd);
+ if (WARN_ON_ONCE(task && task != ubq->ubq_daemon))
+ return;
- if (io->flags & UBLK_IO_FLAG_ACTIVE)
- io_uring_cmd_complete_in_task(io->cmd,
- ublk_cmd_cancel_cb);
+ ub = ubq->dev;
+ need_schedule = ublk_abort_requests(ub, ubq);
+
+ io = &ubq->ios[pdu->tag];
+ WARN_ON_ONCE(io->cmd != cmd);
+ ublk_cancel_cmd(ubq, io, issue_flags);
+
+ if (need_schedule) {
+ if (ublk_can_use_recovery(ub))
+ schedule_work(&ub->quiesce_work);
+ else
+ schedule_work(&ub->stop_work);
}
+}
- /* all io commands are canceled */
- ubq->nr_io_ready = 0;
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+ return ubq->nr_io_ready == ubq->q_depth;
+}
+
+static void ublk_cancel_queue(struct ublk_queue *ubq)
+{
+ int i;
+
+ for (i = 0; i < ubq->q_depth; i++)
+ ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED);
}
/* Cancel all pending commands, must be called after del_gendisk() returns */
@@ -1545,16 +1583,6 @@ static void __ublk_quiesce_dev(struct ublk_device *ub)
blk_mq_quiesce_queue(ub->ub_disk->queue);
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
- ublk_cancel_dev(ub);
- /* we are going to release task_struct of ubq_daemon and resets
- * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
- * Besides, monitor_work is not necessary in QUIESCED state since we have
- * already scheduled quiesce_work and quiesced all ubqs.
- *
- * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
- * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
- */
- cancel_delayed_work_sync(&ub->monitor_work);
}
static void ublk_quiesce_work_fn(struct work_struct *work)
@@ -1568,6 +1596,7 @@ static void ublk_quiesce_work_fn(struct work_struct *work)
__ublk_quiesce_dev(ub);
unlock:
mutex_unlock(&ub->mutex);
+ ublk_cancel_dev(ub);
}
static void ublk_unquiesce_dev(struct ublk_device *ub)
@@ -1593,6 +1622,8 @@ static void ublk_unquiesce_dev(struct ublk_device *ub)
static void ublk_stop_dev(struct ublk_device *ub)
{
+ struct gendisk *disk;
+
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto unlock;
@@ -1602,14 +1633,18 @@ static void ublk_stop_dev(struct ublk_device *ub)
ublk_unquiesce_dev(ub);
}
del_gendisk(ub->ub_disk);
+
+ /* Sync with ublk_abort_queue() by holding the lock */
+ spin_lock(&ub->lock);
+ disk = ub->ub_disk;
ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1;
- put_disk(ub->ub_disk);
ub->ub_disk = NULL;
+ spin_unlock(&ub->lock);
+ put_disk(disk);
unlock:
- ublk_cancel_dev(ub);
mutex_unlock(&ub->mutex);
- cancel_delayed_work_sync(&ub->monitor_work);
+ ublk_cancel_dev(ub);
}
/* device can only be started after all IOs are ready */
@@ -1660,6 +1695,21 @@ static inline void ublk_fill_io_cmd(struct ublk_io *io,
io->addr = buf_addr;
}
+static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
+ unsigned int issue_flags,
+ struct ublk_queue *ubq, unsigned int tag)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ /*
+ * Safe to refer to @ubq since ublk_queue won't be died until its
+ * commands are completed
+ */
+ pdu->ubq = ubq;
+ pdu->tag = tag;
+ io_uring_cmd_mark_cancelable(cmd, issue_flags);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1775,6 +1825,7 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
default:
goto out;
}
+ ublk_prep_cancel(cmd, issue_flags, ubq, tag);
return -EIOCBQUEUED;
out:
@@ -1814,7 +1865,8 @@ fail_put:
return NULL;
}
-static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static inline int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
{
/*
* Not necessary for async retry, but let's keep it simple and always
@@ -1828,9 +1880,33 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
.addr = READ_ONCE(ub_src->addr)
};
+ WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
+
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}
+static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ ublk_ch_uring_cmd_local(cmd, issue_flags);
+}
+
+static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
+ ublk_uring_cmd_cancel_fn(cmd, issue_flags);
+ return 0;
+ }
+
+ /* well-implemented server won't run into unlocked */
+ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+ io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
+ return -EIOCBQUEUED;
+ }
+
+ return ublk_ch_uring_cmd_local(cmd, issue_flags);
+}
+
static inline bool ublk_check_ubuf_dir(const struct request *req,
int ubuf_dir)
{
@@ -1962,6 +2038,7 @@ static int ublk_init_queue(struct ublk_device *ub, int q_id)
void *ptr;
int size;
+ spin_lock_init(&ubq->cancel_lock);
ubq->flags = ub->dev_info.flags;
ubq->q_id = q_id;
ubq->q_depth = ub->dev_info.queue_depth;
@@ -2026,7 +2103,8 @@ static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
if (err == -ENOSPC)
err = -EEXIST;
} else {
- err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
+ err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
+ GFP_NOWAIT);
}
spin_unlock(&ublk_idr_lock);
@@ -2151,8 +2229,6 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
-
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
test_bit(UB_STATE_USED, &ub->state)) {
@@ -2305,6 +2381,12 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
return -EINVAL;
}
+ if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
+ pr_warn("%s: dev id is too large. Max supported is %d\n",
+ __func__, UBLK_MAX_UBLKS - 1);
+ return -EINVAL;
+ }
+
ublk_dump_dev_info(&info);
ret = mutex_lock_killable(&ublk_ctl_mutex);
@@ -2320,10 +2402,9 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
if (!ub)
goto out_unlock;
mutex_init(&ub->mutex);
- spin_lock_init(&ub->mm_lock);
+ spin_lock_init(&ub->lock);
INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
- INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
if (ret < 0)
@@ -2569,13 +2650,15 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
int i;
WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
+
/* All old ioucmds have to be completed */
- WARN_ON_ONCE(ubq->nr_io_ready);
+ ubq->nr_io_ready = 0;
/* old daemon is PF_EXITING, put it now */
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
ubq->timeout = false;
+ ubq->canceling = false;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
@@ -2661,7 +2744,6 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub,
__func__, header->dev_id);
blk_mq_kick_requeue_list(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_LIVE;
- schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);
@@ -2932,7 +3014,22 @@ static void __exit ublk_exit(void)
module_init(ublk_init);
module_exit(ublk_exit);
-module_param(ublks_max, int, 0444);
+static int ublk_set_max_ublks(const char *buf, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
+}
+
+static int ublk_get_max_ublks(char *buf, const struct kernel_param *kp)
+{
+ return sysfs_emit(buf, "%u\n", ublks_max);
+}
+
+static const struct kernel_param_ops ublk_max_ublks_ops = {
+ .set = ublk_set_max_ublks,
+ .get = ublk_get_max_ublks,
+};
+
+module_param_cb(ublks_max, &ublk_max_ublks_ops, &ublks_max, 0644);
MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)");
MODULE_AUTHOR("Ming Lei <[email protected]>");
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 69805d37e113..a4692f8f98ee 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3244,7 +3244,7 @@ size_check:
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
- mddev_lock_nointr(&rs->md);
+ mddev_suspend_and_lock_nointr(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3268,7 +3268,6 @@ size_check:
}
}
- mddev_suspend(&rs->md);
set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags);
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
@@ -3798,9 +3797,7 @@ static void raid_postsuspend(struct dm_target *ti)
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
- mddev_lock_nointr(&rs->md);
- mddev_suspend(&rs->md);
- mddev_unlock(&rs->md);
+ mddev_suspend(&rs->md, false);
}
}
@@ -4059,8 +4056,7 @@ static void raid_resume(struct dm_target *ti)
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
- mddev_resume(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
}
diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
index 6eaa0eab40f9..4b80165afd23 100644
--- a/drivers/md/md-autodetect.c
+++ b/drivers/md/md-autodetect.c
@@ -175,7 +175,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
return;
}
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err) {
pr_err("md: failed to lock array %s\n", name);
goto out_mddev_put;
@@ -221,7 +221,7 @@ static void __init md_setup_drive(struct md_setup_args *args)
if (err)
pr_warn("md: starting %s failed\n", name);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
out_mddev_put:
mddev_put(mddev);
}
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 0c661e5036bb..9672f75c3050 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1861,7 +1861,7 @@ void md_bitmap_destroy(struct mddev *mddev)
md_bitmap_wait_behind_writes(mddev);
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mutex_lock(&mddev->bitmap_info.mutex);
spin_lock(&mddev->lock);
@@ -1977,7 +1977,7 @@ int md_bitmap_load(struct mddev *mddev)
goto out;
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, true);
+ mddev_create_serial_pool(mddev, rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes);
@@ -2348,11 +2348,10 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
{
int rv;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
- mddev_suspend(mddev);
if (mddev->pers) {
if (mddev->recovery || mddev->sync_thread) {
rv = -EBUSY;
@@ -2429,8 +2428,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
}
rv = 0;
out:
- mddev_resume(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (rv)
return rv;
return len;
@@ -2539,7 +2537,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
@@ -2564,16 +2562,16 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!backlog && mddev->serial_info_pool) {
/* serial_info_pool is not needed if backlog is zero */
if (!mddev->serialize_policy)
- mddev_destroy_serial_pool(mddev, NULL, false);
+ mddev_destroy_serial_pool(mddev, NULL);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
rdev_for_each(rdev, mddev)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return len;
}
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index ae2826e9645b..8eca7693b793 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -183,7 +183,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
* in linear_congested(), therefore kfree_rcu() is used to free
* oldconf until no one uses it anymore.
*/
- mddev_suspend(mddev);
oldconf = rcu_dereference_protected(mddev->private,
lockdep_is_held(&mddev->reconfig_mutex));
mddev->raid_disks++;
@@ -192,7 +191,6 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
- mddev_resume(mddev);
kfree_rcu(oldconf, rcu);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 76e2cf609883..8ee079c4dc1e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -206,8 +206,7 @@ static int rdev_need_serial(struct md_rdev *rdev)
* 1. rdev is the first device which return true from rdev_enable_serial.
* 2. rdev is NULL, means we want to enable serialization for all rdevs.
*/
-void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
int ret = 0;
@@ -215,15 +214,12 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
!test_bit(CollisionCheck, &rdev->flags))
return;
- if (!is_suspend)
- mddev_suspend(mddev);
-
if (!rdev)
ret = rdevs_init_serial(mddev);
else
ret = rdev_init_serial(rdev);
if (ret)
- goto abort;
+ return;
if (mddev->serial_info_pool == NULL) {
/*
@@ -238,10 +234,6 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
pr_err("can't alloc memory pool for serialization\n");
}
}
-
-abort:
- if (!is_suspend)
- mddev_resume(mddev);
}
/*
@@ -250,8 +242,7 @@ abort:
* 2. when bitmap is destroyed while policy is not enabled.
* 3. for disable policy, the pool is destroyed only when no rdev needs it.
*/
-void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend)
+void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev)
{
if (rdev && !test_bit(CollisionCheck, &rdev->flags))
return;
@@ -260,8 +251,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
struct md_rdev *temp;
int num = 0; /* used to track if other rdevs need the pool */
- if (!is_suspend)
- mddev_suspend(mddev);
rdev_for_each(temp, mddev) {
if (!rdev) {
if (!mddev->serialize_policy ||
@@ -283,8 +272,6 @@ void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
mempool_destroy(mddev->serial_info_pool);
mddev->serial_info_pool = NULL;
}
- if (!is_suspend)
- mddev_resume(mddev);
}
}
@@ -359,11 +346,11 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
if (bio_data_dir(bio) != WRITE)
return false;
- if (mddev->suspend_lo >= mddev->suspend_hi)
+ if (READ_ONCE(mddev->suspend_lo) >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
+ if (bio->bi_iter.bi_sector >= READ_ONCE(mddev->suspend_hi))
return false;
- if (bio_end_sector(bio) < mddev->suspend_lo)
+ if (bio_end_sector(bio) < READ_ONCE(mddev->suspend_lo))
return false;
return true;
}
@@ -431,42 +418,73 @@ static void md_submit_bio(struct bio *bio)
md_handle_request(mddev, bio);
}
-/* mddev_suspend makes sure no new requests are submitted
- * to the device, and that any requests that have been submitted
- * are completely handled.
- * Once mddev_detach() is called and completes, the module will be
- * completely unused.
+/*
+ * Make sure no new requests are submitted to the device, and any requests that
+ * have been submitted are completely handled.
*/
-void mddev_suspend(struct mddev *mddev)
+int mddev_suspend(struct mddev *mddev, bool interruptible)
{
- struct md_thread *thread = rcu_dereference_protected(mddev->thread,
- lockdep_is_held(&mddev->reconfig_mutex));
+ int err = 0;
- WARN_ON_ONCE(thread && current == thread->tsk);
- if (mddev->suspended++)
- return;
- wake_up(&mddev->sb_wait);
- set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
- percpu_ref_kill(&mddev->active_io);
+ /*
+ * hold reconfig_mutex to wait for normal io will deadlock, because
+ * other context can't update super_block, and normal io can rely on
+ * updating super_block.
+ */
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
+
+ if (interruptible)
+ err = mutex_lock_interruptible(&mddev->suspend_mutex);
+ else
+ mutex_lock(&mddev->suspend_mutex);
+ if (err)
+ return err;
+
+ if (mddev->suspended) {
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
+ }
- if (mddev->pers && mddev->pers->prepare_suspend)
- mddev->pers->prepare_suspend(mddev);
+ percpu_ref_kill(&mddev->active_io);
+ if (interruptible)
+ err = wait_event_interruptible(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ else
+ wait_event(mddev->sb_wait,
+ percpu_ref_is_zero(&mddev->active_io));
+ if (err) {
+ percpu_ref_resurrect(&mddev->active_io);
+ mutex_unlock(&mddev->suspend_mutex);
+ return err;
+ }
- wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
- clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
- wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
+ /*
+ * For raid456, io might be waiting for reshape to make progress,
+ * allow new reshape to start while waiting for io to be done to
+ * prevent deadlock.
+ */
+ WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
del_timer_sync(&mddev->safemode_timer);
/* restrict memory reclaim I/O during raid array is suspend */
mddev->noio_flag = memalloc_noio_save();
+
+ mutex_unlock(&mddev->suspend_mutex);
+ return 0;
}
EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
- lockdep_assert_held(&mddev->reconfig_mutex);
- if (--mddev->suspended)
+ lockdep_assert_not_held(&mddev->reconfig_mutex);
+
+ mutex_lock(&mddev->suspend_mutex);
+ WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
+ if (mddev->suspended) {
+ mutex_unlock(&mddev->suspend_mutex);
return;
+ }
/* entred the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
@@ -477,6 +495,8 @@ void mddev_resume(struct mddev *mddev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+
+ mutex_unlock(&mddev->suspend_mutex);
}
EXPORT_SYMBOL_GPL(mddev_resume);
@@ -616,23 +636,28 @@ static inline struct mddev *mddev_get(struct mddev *mddev)
static void mddev_delayed_delete(struct work_struct *ws);
+static void __mddev_put(struct mddev *mddev)
+{
+ if (mddev->raid_disks || !list_empty(&mddev->disks) ||
+ mddev->ctime || mddev->hold_active)
+ return;
+
+ /* Array is not configured at all, and not held active, so destroy it */
+ set_bit(MD_DELETED, &mddev->flags);
+
+ /*
+ * Call queue_work inside the spinlock so that flush_workqueue() after
+ * mddev_find will succeed in waiting for the work to be done.
+ */
+ queue_work(md_misc_wq, &mddev->del_work);
+}
+
void mddev_put(struct mddev *mddev)
{
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
- if (!mddev->raid_disks && list_empty(&mddev->disks) &&
- mddev->ctime == 0 && !mddev->hold_active) {
- /* Array is not configured at all, and not held active,
- * so destroy it */
- set_bit(MD_DELETED, &mddev->flags);
- /*
- * Call queue_work inside the spinlock so that
- * flush_workqueue() after mddev_find will succeed in waiting
- * for the work to be done.
- */
- queue_work(md_misc_wq, &mddev->del_work);
- }
+ __mddev_put(mddev);
spin_unlock(&all_mddevs_lock);
}
@@ -667,6 +692,7 @@ int mddev_init(struct mddev *mddev)
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
+ mutex_init(&mddev->suspend_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -2454,7 +2480,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
pr_debug("md: bind<%s>\n", b);
if (mddev->raid_disks)
- mddev_create_serial_pool(mddev, rdev, false);
+ mddev_create_serial_pool(mddev, rdev);
if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
goto fail;
@@ -2507,7 +2533,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
@@ -2837,11 +2863,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
*/
super_types[mddev->major_version].
validate_super(mddev, rdev);
- if (add_journal)
- mddev_suspend(mddev);
err = mddev->pers->hot_add_disk(mddev, rdev);
- if (add_journal)
- mddev_resume(mddev);
if (err) {
md_kick_rdev_from_array(rdev);
return err;
@@ -2978,11 +3000,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
- mddev_create_serial_pool(rdev->mddev, rdev, false);
+ mddev_create_serial_pool(rdev->mddev, rdev);
need_update_sb = true;
err = 0;
} else if (cmd_match(buf, "-writemostly")) {
- mddev_destroy_serial_pool(rdev->mddev, rdev, false);
+ mddev_destroy_serial_pool(rdev->mddev, rdev);
clear_bit(WriteMostly, &rdev->flags);
need_update_sb = true;
err = 0;
@@ -3594,6 +3616,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
+ bool suspend = false;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
@@ -3601,17 +3624,25 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+ if (!mddev)
+ return -ENODEV;
- if (entry->store == state_store && cmd_match(page, "remove"))
- kn = sysfs_break_active_protection(kobj, attr);
+ if (entry->store == state_store) {
+ if (cmd_match(page, "remove"))
+ kn = sysfs_break_active_protection(kobj, attr);
+ if (cmd_match(page, "remove") || cmd_match(page, "re-add") ||
+ cmd_match(page, "writemostly") ||
+ cmd_match(page, "-writemostly"))
+ suspend = true;
+ }
- rv = mddev ? mddev_lock(mddev) : -ENODEV;
+ rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
if (!rv) {
if (rdev->mddev == NULL)
rv = -ENODEV;
else
rv = entry->store(rdev, page, length);
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
}
if (kn)
@@ -3916,7 +3947,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
if (slen == 0 || slen >= sizeof(clevel))
return -EINVAL;
- rv = mddev_lock(mddev);
+ rv = mddev_suspend_and_lock(mddev);
if (rv)
return rv;
@@ -4009,7 +4040,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
}
/* Looks like we have a winner */
- mddev_suspend(mddev);
mddev_detach(mddev);
spin_lock(&mddev->lock);
@@ -4095,14 +4125,13 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- mddev_resume(mddev);
if (!mddev->thread)
md_update_sb(mddev, 1);
sysfs_notify_dirent_safe(mddev->sysfs_level);
md_new_event();
rv = len;
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return rv;
}
@@ -4410,6 +4439,18 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
+ /* No lock dependent actions */
+ switch (st) {
+ case suspended: /* not supported yet */
+ case write_pending: /* cannot be set */
+ case active_idle: /* cannot be set */
+ case broken: /* cannot be set */
+ case bad_word:
+ return -EINVAL;
+ default:
+ break;
+ }
+
if (mddev->pers && (st == active || st == clean) &&
mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
@@ -4434,23 +4475,16 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = mddev_lock(mddev);
if (err)
return err;
- err = -EINVAL;
- switch(st) {
- case bad_word:
- break;
- case clear:
- /* stopping an active array */
- err = do_md_stop(mddev, 0, NULL);
- break;
+
+ switch (st) {
case inactive:
- /* stopping an active array */
+ /* stop an active array, return 0 otherwise */
if (mddev->pers)
err = do_md_stop(mddev, 2, NULL);
- else
- err = 0; /* already inactive */
break;
- case suspended:
- break; /* not supported yet */
+ case clear:
+ err = do_md_stop(mddev, 0, NULL);
+ break;
case readonly:
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
@@ -4501,10 +4535,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = do_md_run(mddev);
}
break;
- case write_pending:
- case active_idle:
- case broken:
- /* these cannot be set */
+ default:
+ err = -EINVAL;
break;
}
@@ -4577,7 +4609,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->persistent) {
@@ -4598,14 +4630,14 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
rdev = md_import_device(dev, -1, -1);
if (IS_ERR(rdev)) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return PTR_ERR(rdev);
}
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
return err ? err : len;
@@ -5171,7 +5203,8 @@ __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
static ssize_t
suspend_lo_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_lo));
}
static ssize_t
@@ -5186,15 +5219,13 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- mddev_suspend(mddev);
- mddev->suspend_lo = new;
+ WRITE_ONCE(mddev->suspend_lo, new);
mddev_resume(mddev);
- mddev_unlock(mddev);
return len;
}
static struct md_sysfs_entry md_suspend_lo =
@@ -5203,7 +5234,8 @@ __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
static ssize_t
suspend_hi_show(struct mddev *mddev, char *page)
{
- return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
+ return sprintf(page, "%llu\n",
+ (unsigned long long)READ_ONCE(mddev->suspend_hi));
}
static ssize_t
@@ -5218,15 +5250,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
if (new != (sector_t)new)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend(mddev, true);
if (err)
return err;
- mddev_suspend(mddev);
- mddev->suspend_hi = new;
+ WRITE_ONCE(mddev->suspend_hi, new);
mddev_resume(mddev);
- mddev_unlock(mddev);
return len;
}
static struct md_sysfs_entry md_suspend_hi =
@@ -5474,7 +5504,7 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
if (value == mddev->serialize_policy)
return len;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
if (mddev->pers == NULL || (mddev->pers->level != 1)) {
@@ -5483,15 +5513,13 @@ serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
goto unlock;
}
- mddev_suspend(mddev);
if (value)
- mddev_create_serial_pool(mddev, NULL, true);
+ mddev_create_serial_pool(mddev, NULL);
else
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
mddev->serialize_policy = value;
- mddev_resume(mddev);
unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -6254,7 +6282,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
/* disable policy to guarantee rdevs free resources for serialization */
mddev->serialize_policy = 0;
- mddev_destroy_serial_pool(mddev, NULL, true);
+ mddev_destroy_serial_pool(mddev, NULL);
}
void md_stop_writes(struct mddev *mddev)
@@ -6546,13 +6574,13 @@ static void autorun_devices(int part)
if (IS_ERR(mddev))
break;
- if (mddev_lock(mddev))
+ if (mddev_suspend_and_lock(mddev))
pr_warn("md: %s locked, cannot run\n", mdname(mddev));
else if (mddev->raid_disks || mddev->major_version
|| !list_empty(&mddev->disks)) {
pr_warn("md: %s already running, cannot run %pg\n",
mdname(mddev), rdev0->bdev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
} else {
pr_debug("md: created %s\n", mdname(mddev));
mddev->persistent = 1;
@@ -6562,7 +6590,7 @@ static void autorun_devices(int part)
export_rdev(rdev, mddev);
}
autorun_array(mddev);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
}
/* on success, candidates will be empty, on error
* it won't...
@@ -7112,7 +7140,6 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
struct bitmap *bitmap;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
err = md_bitmap_load(mddev);
@@ -7122,11 +7149,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
md_bitmap_destroy(mddev);
fd = -1;
}
- mddev_resume(mddev);
} else if (fd < 0) {
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
}
}
if (fd < 0) {
@@ -7415,7 +7439,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
bitmap = md_bitmap_create(mddev, -1);
- mddev_suspend(mddev);
if (!IS_ERR(bitmap)) {
mddev->bitmap = bitmap;
rv = md_bitmap_load(mddev);
@@ -7423,7 +7446,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = PTR_ERR(bitmap);
if (rv)
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
} else {
/* remove the bitmap */
if (!mddev->bitmap) {
@@ -7448,9 +7470,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
module_put(md_cluster_mod);
mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY;
}
- mddev_suspend(mddev);
md_bitmap_destroy(mddev);
- mddev_resume(mddev);
mddev->bitmap_info.offset = 0;
}
}
@@ -7521,6 +7541,20 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
+static bool md_ioctl_need_suspend(unsigned int cmd)
+{
+ switch (cmd) {
+ case ADD_NEW_DISK:
+ case HOT_ADD_DISK:
+ case HOT_REMOVE_DISK:
+ case SET_BITMAP_FILE:
+ case SET_ARRAY_INFO:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int __md_set_array_info(struct mddev *mddev, void __user *argp)
{
mdu_array_info_t info;
@@ -7653,7 +7687,8 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
if (!md_is_rdwr(mddev))
flush_work(&mddev->sync_work);
- err = mddev_lock(mddev);
+ err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
+ mddev_lock(mddev);
if (err) {
pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
err, cmd);
@@ -7781,7 +7816,10 @@ unlock:
if (mddev->hold_active == UNTIL_IOCTL &&
err != -EINVAL)
mddev->hold_active = 0;
- mddev_unlock(mddev);
+
+ md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
+ mddev_unlock(mddev);
+
out:
if(did_set_md_closing)
clear_bit(MD_CLOSING, &mddev->flags);
@@ -8208,105 +8246,46 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
}
static void *md_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&all_mddevs_lock)
{
- struct list_head *tmp;
- loff_t l = *pos;
- struct mddev *mddev;
+ struct md_personality *pers;
- if (l == 0x10000) {
- ++*pos;
- return (void *)2;
- }
- if (l > 0x10000)
- return NULL;
- if (!l--)
- /* header */
- return (void*)1;
+ seq_puts(seq, "Personalities : ");
+ spin_lock(&pers_lock);
+ list_for_each_entry(pers, &pers_list, list)
+ seq_printf(seq, "[%s] ", pers->name);
+
+ spin_unlock(&pers_lock);
+ seq_puts(seq, "\n");
+ seq->poll_event = atomic_read(&md_event_count);
spin_lock(&all_mddevs_lock);
- list_for_each(tmp,&all_mddevs)
- if (!l--) {
- mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (!mddev_get(mddev))
- continue;
- spin_unlock(&all_mddevs_lock);
- return mddev;
- }
- spin_unlock(&all_mddevs_lock);
- if (!l--)
- return (void*)2;/* tail */
- return NULL;
+
+ return seq_list_start(&all_mddevs, *pos);
}
static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *tmp;
- struct mddev *next_mddev, *mddev = v;
- struct mddev *to_put = NULL;
-
- ++*pos;
- if (v == (void*)2)
- return NULL;
-
- spin_lock(&all_mddevs_lock);
- if (v == (void*)1) {
- tmp = all_mddevs.next;
- } else {
- to_put = mddev;
- tmp = mddev->all_mddevs.next;
- }
-
- for (;;) {
- if (tmp == &all_mddevs) {
- next_mddev = (void*)2;
- *pos = 0x10000;
- break;
- }
- next_mddev = list_entry(tmp, struct mddev, all_mddevs);
- if (mddev_get(next_mddev))
- break;
- mddev = next_mddev;
- tmp = mddev->all_mddevs.next;
- }
- spin_unlock(&all_mddevs_lock);
-
- if (to_put)
- mddev_put(to_put);
- return next_mddev;
-
+ return seq_list_next(v, &all_mddevs, pos);
}
static void md_seq_stop(struct seq_file *seq, void *v)
+ __releases(&all_mddevs_lock)
{
- struct mddev *mddev = v;
-
- if (mddev && v != (void*)1 && v != (void*)2)
- mddev_put(mddev);
+ status_unused(seq);
+ spin_unlock(&all_mddevs_lock);
}
static int md_seq_show(struct seq_file *seq, void *v)
{
- struct mddev *mddev = v;
+ struct mddev *mddev = list_entry(v, struct mddev, all_mddevs);
sector_t sectors;
struct md_rdev *rdev;
- if (v == (void*)1) {
- struct md_personality *pers;
- seq_printf(seq, "Personalities : ");
- spin_lock(&pers_lock);
- list_for_each_entry(pers, &pers_list, list)
- seq_printf(seq, "[%s] ", pers->name);
-
- spin_unlock(&pers_lock);
- seq_printf(seq, "\n");
- seq->poll_event = atomic_read(&md_event_count);
- return 0;
- }
- if (v == (void*)2) {
- status_unused(seq);
+ if (!mddev_get(mddev))
return 0;
- }
+ spin_unlock(&all_mddevs_lock);
spin_lock(&mddev->lock);
if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
seq_printf(seq, "%s : %sactive", mdname(mddev),
@@ -8377,6 +8356,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\n");
}
spin_unlock(&mddev->lock);
+ spin_lock(&all_mddevs_lock);
+ if (atomic_dec_and_test(&mddev->active))
+ __mddev_put(mddev);
return 0;
}
@@ -9371,8 +9353,13 @@ static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, sync_work);
int spares = 0;
+ bool suspend = false;
- mddev_lock_nointr(mddev);
+ if (md_spares_need_change(mddev))
+ suspend = true;
+
+ suspend ? mddev_suspend_and_lock_nointr(mddev) :
+ mddev_lock_nointr(mddev);
if (!md_is_rdwr(mddev)) {
/*
@@ -9408,7 +9395,7 @@ static void md_start_sync(struct work_struct *ws)
goto not_running;
}
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
md_new_event();
@@ -9420,7 +9407,7 @@ not_running:
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
- mddev_unlock(mddev);
+ suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
wake_up(&resync_wait);
if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
@@ -9452,19 +9439,7 @@ not_running:
*/
void md_check_recovery(struct mddev *mddev)
{
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
- /* Write superblock - thread that called mddev_suspend()
- * holds reconfig_mutex for us.
- */
- set_bit(MD_UPDATING_SB, &mddev->flags);
- smp_mb__after_atomic();
- if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
- md_update_sb(mddev, 0);
- clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
- wake_up(&mddev->sb_wait);
- }
-
- if (is_md_suspended(mddev))
+ if (READ_ONCE(mddev->suspended))
return;
if (mddev->bitmap)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b628c292506e..55d01d431418 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -248,10 +248,6 @@ struct md_cluster_info;
* become failed.
* @MD_HAS_PPL: The raid array has PPL feature set.
* @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
- * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
- * without taking reconfig_mutex.
- * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
- * explicitly holding reconfig_mutex.
* @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
* array is ready yet.
* @MD_BROKEN: This is used to stop writes and mark array as failed.
@@ -268,8 +264,6 @@ enum mddev_flags {
MD_FAILFAST_SUPPORTED,
MD_HAS_PPL,
MD_HAS_MULTIPLE_PPLS,
- MD_ALLOW_SB_UPDATE,
- MD_UPDATING_SB,
MD_NOT_READY,
MD_BROKEN,
MD_DELETED,
@@ -316,6 +310,7 @@ struct mddev {
unsigned long sb_flags;
int suspended;
+ struct mutex suspend_mutex;
struct percpu_ref active_io;
int ro;
int sysfs_active; /* set when sysfs deletes
@@ -809,15 +804,14 @@ extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
-extern void mddev_suspend(struct mddev *mddev);
+extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
-extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
-extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
- bool is_suspend);
+extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev);
+extern void mddev_destroy_serial_pool(struct mddev *mddev,
+ struct md_rdev *rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
@@ -855,6 +849,33 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
mddev->queue->limits.max_write_zeroes_sectors = 0;
}
+static inline int mddev_suspend_and_lock(struct mddev *mddev)
+{
+ int ret;
+
+ ret = mddev_suspend(mddev, true);
+ if (ret)
+ return ret;
+
+ ret = mddev_lock(mddev);
+ if (ret)
+ mddev_resume(mddev);
+
+ return ret;
+}
+
+static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
+{
+ mddev_suspend(mddev, false);
+ mutex_lock(&mddev->reconfig_mutex);
+}
+
+static inline void mddev_unlock_and_resume(struct mddev *mddev)
+{
+ mddev_unlock(mddev);
+ mddev_resume(mddev);
+}
+
struct mdu_array_info_s;
struct mdu_disk_info_s;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3a78f79ee6d5..35d12948e0a9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1345,6 +1345,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
bool write_behind = false;
+ bool is_discard = (bio_op(bio) == REQ_OP_DISCARD);
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
@@ -1405,7 +1406,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
* write-mostly, which means we could allocate write behind
* bio later.
*/
- if (rdev && test_bit(WriteMostly, &rdev->flags))
+ if (!is_discard && rdev && test_bit(WriteMostly, &rdev->flags))
write_behind = true;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 518b7cfa78b9..6157f5beb9fe 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -327,8 +327,9 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
void r5c_check_stripe_cache_usage(struct r5conf *conf)
{
int total_cached;
+ struct r5l_log *log = READ_ONCE(conf->log);
- if (!r5c_is_writeback(conf->log))
+ if (!r5c_is_writeback(log))
return;
total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
@@ -344,7 +345,7 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
if (total_cached > conf->min_nr_stripes * 1 / 2 ||
atomic_read(&conf->empty_inactive_list_nr) > 0)
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -353,7 +354,9 @@ void r5c_check_stripe_cache_usage(struct r5conf *conf)
*/
void r5c_check_cached_full_stripe(struct r5conf *conf)
{
- if (!r5c_is_writeback(conf->log))
+ struct r5l_log *log = READ_ONCE(conf->log);
+
+ if (!r5c_is_writeback(log))
return;
/*
@@ -363,7 +366,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
if (atomic_read(&conf->r5c_cached_full_stripes) >=
min(R5C_FULL_STRIPE_FLUSH_BATCH(conf),
conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf)))
- r5l_wake_reclaim(conf->log, 0);
+ r5l_wake_reclaim(log, 0);
}
/*
@@ -396,7 +399,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
*/
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!r5c_is_writeback(log))
return 0;
@@ -449,7 +452,7 @@ static inline void r5c_update_log_state(struct r5l_log *log)
void r5c_make_stripe_write_out(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
BUG_ON(!r5c_is_writeback(log));
@@ -491,7 +494,7 @@ static void r5c_handle_parity_cached(struct stripe_head *sh)
*/
static void r5c_finish_cache_stripe(struct stripe_head *sh)
{
- struct r5l_log *log = sh->raid_conf->log;
+ struct r5l_log *log = READ_ONCE(sh->raid_conf->log);
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
@@ -683,7 +686,6 @@ static void r5c_disable_writeback_async(struct work_struct *work)
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
- int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
@@ -692,14 +694,14 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
- conf->log == NULL ||
- (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
- (locked = mddev_trylock(mddev))));
- if (locked) {
- mddev_suspend(mddev);
+ !READ_ONCE(conf->log) ||
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
+ log = READ_ONCE(conf->log);
+ if (log) {
+ mddev_suspend(mddev, false);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
- mddev_unlock(mddev);
}
}
@@ -1151,7 +1153,7 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
static sector_t r5c_calculate_new_cp(struct r5conf *conf)
{
struct stripe_head *sh;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t new_cp;
unsigned long flags;
@@ -1159,12 +1161,12 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
return log->next_checkpoint;
spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
- if (list_empty(&conf->log->stripe_in_journal_list)) {
+ if (list_empty(&log->stripe_in_journal_list)) {
/* all stripes flushed */
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
return log->next_checkpoint;
}
- sh = list_first_entry(&conf->log->stripe_in_journal_list,
+ sh = list_first_entry(&log->stripe_in_journal_list,
struct stripe_head, r5c);
new_cp = sh->log_start;
spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
@@ -1399,7 +1401,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
struct stripe_head *sh, *next;
lockdep_assert_held(&conf->device_lock);
- if (!conf->log)
+ if (!READ_ONCE(conf->log))
return;
count = 0;
@@ -1420,7 +1422,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
static void r5c_do_reclaim(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
struct stripe_head *sh;
int count = 0;
unsigned long flags;
@@ -1549,7 +1551,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
@@ -1591,7 +1593,7 @@ void r5l_quiesce(struct r5l_log *log, int quiesce)
bool r5l_log_disk_error(struct r5conf *conf)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
/* don't allow write if journal disk is missing */
if (!log)
@@ -2583,9 +2585,7 @@ int r5c_journal_mode_set(struct mddev *mddev, int mode)
mode == R5C_JOURNAL_MODE_WRITE_BACK)
return -EINVAL;
- mddev_suspend(mddev);
conf->log->r5c_journal_mode = mode;
- mddev_resume(mddev);
pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
mdname(mddev), mode, r5c_journal_mode_str[mode]);
@@ -2610,11 +2610,11 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
if (strlen(r5c_journal_mode_str[mode]) == len &&
!strncmp(page, r5c_journal_mode_str[mode], len))
break;
- ret = mddev_lock(mddev);
+ ret = mddev_suspend_and_lock(mddev);
if (ret)
return ret;
ret = r5c_journal_mode_set(mddev, mode);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return ret ?: length;
}
@@ -2635,7 +2635,7 @@ int r5c_try_caching_write(struct r5conf *conf,
struct stripe_head_state *s,
int disks)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
struct r5dev *dev;
int to_cache = 0;
@@ -2802,7 +2802,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
int i;
int do_wakeup = 0;
sector_t tree_index;
@@ -2941,7 +2941,7 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
/* check whether this big stripe is in write back cache. */
bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
{
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
sector_t tree_index;
void *slot;
@@ -3049,14 +3049,14 @@ int r5l_start(struct r5l_log *log)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = READ_ONCE(conf->log);
if (!log)
return;
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
- conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
+ log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
@@ -3145,7 +3145,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);
- conf->log = log;
+ WRITE_ONCE(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;
@@ -3173,7 +3173,7 @@ void r5l_exit_log(struct r5conf *conf)
* 'reconfig_mutex' is held by caller, set 'confg->log' to NULL to
* ensure disable_writeback_work wakes up and exits.
*/
- conf->log = NULL;
+ WRITE_ONCE(conf->log, NULL);
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6383723468e5..d6de084a85e5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,6 +70,8 @@ MODULE_PARM_DESC(devices_handle_discard_safely,
"Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
+static void raid5_quiesce(struct mddev *mddev, int quiesce);
+
static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
{
int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK;
@@ -2492,15 +2494,12 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
unsigned long cpu;
int err = 0;
- /*
- * Never shrink. And mddev_suspend() could deadlock if this is called
- * from raid5d. In that case, scribble_disks and scribble_sectors
- * should equal to new_disks and new_sectors
- */
+ /* Never shrink. */
if (conf->scribble_disks >= new_disks &&
conf->scribble_sectors >= new_sectors)
return 0;
- mddev_suspend(conf->mddev);
+
+ raid5_quiesce(conf->mddev, true);
cpus_read_lock();
for_each_present_cpu(cpu) {
@@ -2514,7 +2513,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
}
cpus_read_unlock();
- mddev_resume(conf->mddev);
+ raid5_quiesce(conf->mddev, false);
+
if (!err) {
conf->scribble_disks = new_disks;
conf->scribble_sectors = new_sectors;
@@ -7025,7 +7025,7 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
new != roundup_pow_of_two(new))
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
@@ -7049,7 +7049,6 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
goto out_unlock;
}
- mddev_suspend(mddev);
mutex_lock(&conf->cache_size_mutex);
size = conf->max_nr_stripes;
@@ -7064,10 +7063,9 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
err = -ENOMEM;
}
mutex_unlock(&conf->cache_size_mutex);
- mddev_resume(mddev);
out_unlock:
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7153,7 +7151,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
new = !!new;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
@@ -7162,15 +7160,13 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
- mddev_suspend(mddev);
conf->skip_copy = new;
if (new)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -7225,15 +7221,13 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
if (new > 8192)
return -EINVAL;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf)
err = -ENODEV;
else if (new != conf->worker_cnt_per_group) {
- mddev_suspend(mddev);
-
old_groups = conf->worker_groups;
if (old_groups)
flush_workqueue(raid5_wq);
@@ -7250,9 +7244,8 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
kfree(old_groups[0].workers);
kfree(old_groups);
}
- mddev_resume(mddev);
}
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err ?: len;
}
@@ -8558,8 +8551,8 @@ static int raid5_start_reshape(struct mddev *mddev)
* the reshape wasn't running - like Discard or Read - have
* completed.
*/
- mddev_suspend(mddev);
- mddev_resume(mddev);
+ raid5_quiesce(mddev, true);
+ raid5_quiesce(mddev, false);
/* Add some new drives, as many as will fit.
* We know there are enough to make the newly sized array work.
@@ -8974,12 +8967,12 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
struct r5conf *conf;
int err;
- err = mddev_lock(mddev);
+ err = mddev_suspend_and_lock(mddev);
if (err)
return err;
conf = mddev->private;
if (!conf) {
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return -ENODEV;
}
@@ -8989,19 +8982,14 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
err = log_init(conf, NULL, true);
if (!err) {
err = resize_stripes(conf, conf->pool_size);
- if (err) {
- mddev_suspend(mddev);
+ if (err)
log_exit(conf);
- mddev_resume(mddev);
- }
}
} else
err = -EINVAL;
} else if (strncmp(buf, "resync", 6) == 0) {
if (raid5_has_ppl(conf)) {
- mddev_suspend(mddev);
log_exit(conf);
- mddev_resume(mddev);
err = resize_stripes(conf, conf->pool_size);
} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
r5l_log_disk_error(conf)) {
@@ -9014,11 +9002,9 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
break;
}
- if (!journal_dev_exists) {
- mddev_suspend(mddev);
+ if (!journal_dev_exists)
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
- mddev_resume(mddev);
- } else /* need remove journal device first */
+ else /* need remove journal device first */
err = -EBUSY;
} else
err = -EINVAL;
@@ -9029,7 +9015,7 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
if (!err)
md_update_sb(mddev, 1);
- mddev_unlock(mddev);
+ mddev_unlock_and_resume(mddev);
return err;
}
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 106cdc55ff3b..b4391e0a9bc8 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -20,8 +20,15 @@ enum io_uring_cmd_flags {
IO_URING_F_SQE128 = (1 << 8),
IO_URING_F_CQE32 = (1 << 9),
IO_URING_F_IOPOLL = (1 << 10),
+
+ /* set when uring wants to cancel a previously issued command */
+ IO_URING_F_CANCEL = (1 << 11),
};
+/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
+#define IORING_URING_CMD_CANCELABLE (1U << 30)
+#define IORING_URING_CMD_POLLED (1U << 31)
+
struct io_uring_cmd {
struct file *file;
const struct io_uring_sqe *sqe;
@@ -82,6 +89,9 @@ static inline void io_uring_free(struct task_struct *tsk)
__io_uring_free(tsk);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags);
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
@@ -122,6 +132,14 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
{
return -EOPNOTSUPP;
}
+static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+}
+static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+ return NULL;
+}
#endif
#endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 13d19b9be9f4..e4e67899b134 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -265,6 +265,12 @@ struct io_ring_ctx {
*/
struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
+
+ /*
+ * Any cancelable uring_cmd is added to this list in
+ * ->uring_cmd() by io_uring_cmd_insert_cancelable()
+ */
+ struct hlist_head cancelable_uring_cmd;
} ____cacheline_aligned_in_smp;
struct {
@@ -313,6 +319,8 @@ struct io_ring_ctx {
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
+ struct hlist_head waitid_list;
+
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
@@ -342,8 +350,6 @@ struct io_ring_ctx {
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
- struct list_head io_buffers_pages;
-
#if defined(CONFIG_UNIX)
struct socket *ring_sock;
#endif
diff --git a/include/linux/sed-opal-key.h b/include/linux/sed-opal-key.h
new file mode 100644
index 000000000000..0ca03054e8f6
--- /dev/null
+++ b/include/linux/sed-opal-key.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SED key operations.
+ *
+ * Copyright (C) 2023 IBM Corporation
+ *
+ * These are the accessor functions (read/write) for SED Opal
+ * keys. Specific keystores can provide overrides.
+ *
+ */
+
+#include <linux/kernel.h>
+
+#ifdef CONFIG_PSERIES_PLPKS_SED
+int sed_read_key(char *keyname, char *key, u_int *keylen);
+int sed_write_key(char *keyname, char *key, u_int keylen);
+#else
+static inline
+int sed_read_key(char *keyname, char *key, u_int *keylen) {
+ return -EOPNOTSUPP;
+}
+static inline
+int sed_write_key(char *keyname, char *key, u_int keylen) {
+ return -EOPNOTSUPP;
+}
+#endif
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 8e61f8b7c2ce..425f64eee44e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -65,6 +65,7 @@ struct io_uring_sqe {
__u32 xattr_flags;
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
+ __u32 waitid_flags;
};
__u64 user_data; /* data to be passed back at completion time */
/* pack this to avoid bogus arm OABI complaints */
@@ -240,19 +241,20 @@ enum io_uring_op {
IORING_OP_URING_CMD,
IORING_OP_SEND_ZC,
IORING_OP_SENDMSG_ZC,
+ IORING_OP_READ_MULTISHOT,
+ IORING_OP_WAITID,
/* this goes last, obviously */
IORING_OP_LAST,
};
/*
- * sqe->uring_cmd_flags
+ * sqe->uring_cmd_flags top 8bits aren't available for userspace
* IORING_URING_CMD_FIXED use registered buffer; pass this flag
* along with setting sqe->buf_index.
- * IORING_URING_CMD_POLLED driver use only
*/
#define IORING_URING_CMD_FIXED (1U << 0)
-#define IORING_URING_CMD_POLLED (1U << 31)
+#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED
/*
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 8cc8e5387a75..7bd64e442567 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
openclose.o uring_cmd.o epoll.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
- cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
+ cancel.o kbuf.o rsrc.o rw.o opdef.o \
+ notif.o waitid.o
obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 7b23607cf4af..eb77a51c5a79 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -15,6 +15,7 @@
#include "tctx.h"
#include "poll.h"
#include "timeout.h"
+#include "waitid.h"
#include "cancel.h"
struct io_cancel {
@@ -119,6 +120,10 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
if (ret != -ENOENT)
return ret;
+ ret = io_waitid_cancel(ctx, cd, issue_flags);
+ if (ret != -ENOENT)
+ return ret;
+
spin_lock(&ctx->completion_lock);
if (!(cd->flags & IORING_ASYNC_CANCEL_FD))
ret = io_timeout_cancel(ctx, cd);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 783ed0fff71b..b9e1af5772f3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -92,6 +92,7 @@
#include "cancel.h"
#include "net.h"
#include "notif.h"
+#include "waitid.h"
#include "timeout.h"
#include "poll.h"
@@ -338,7 +339,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
- INIT_LIST_HEAD(&ctx->io_buffers_pages);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
@@ -348,8 +348,10 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_WQ_LIST(&ctx->locked_free_list);
+ INIT_HLIST_HEAD(&ctx->waitid_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+ INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
return ctx;
err:
kfree(ctx->cancel_table.hbs);
@@ -3256,6 +3258,37 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
return ret;
}
+static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
+ struct task_struct *task, bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool ret = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd,
+ hash_node) {
+ struct io_uring_cmd *cmd = io_kiocb_to_cmd(req,
+ struct io_uring_cmd);
+ struct file *file = req->file;
+
+ if (!cancel_all && req->task != task)
+ continue;
+
+ if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
+ /* ->sqe isn't available if no async data */
+ if (!req_has_async_data(req))
+ cmd->sqe = NULL;
+ file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL);
+ ret = true;
+ }
+ }
+ io_submit_flush_completions(ctx);
+
+ return ret;
+}
+
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
@@ -3303,6 +3336,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
ret |= io_cancel_defer_files(ctx, task, cancel_all);
mutex_lock(&ctx->uring_lock);
ret |= io_poll_remove_all(ctx, task, cancel_all);
+ ret |= io_waitid_remove_all(ctx, task, cancel_all);
+ ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all);
mutex_unlock(&ctx->uring_lock);
ret |= io_kill_timeouts(ctx, task, cancel_all);
if (task)
@@ -4666,6 +4701,9 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
+ /* top 8bits are for internal use */
+ BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+
io_uring_optable_init();
/*
@@ -4681,6 +4719,9 @@ static int __init io_uring_init(void)
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
+ io_buf_cachep = kmem_cache_create("io_buffer", sizeof(struct io_buffer), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+ NULL);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 547c30582fb8..2ff719ae1b57 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -330,6 +330,7 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
}
extern struct kmem_cache *req_cachep;
+extern struct kmem_cache *io_buf_cachep;
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 556f4df25b0f..d5a04467666f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -19,12 +19,17 @@
#define BGID_ARRAY 64
+/* BIDs are addressed by a 16-bit field in a CQE */
+#define MAX_BIDS_PER_BGID (1 << 16)
+
+struct kmem_cache *io_buf_cachep;
+
struct io_provide_buf {
struct file *file;
__u64 addr;
__u32 len;
__u32 bgid;
- __u16 nbufs;
+ __u32 nbufs;
__u16 bid;
};
@@ -255,6 +260,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
void io_destroy_buffers(struct io_ring_ctx *ctx)
{
struct io_buffer_list *bl;
+ struct list_head *item, *tmp;
+ struct io_buffer *buf;
unsigned long index;
int i;
@@ -270,12 +277,9 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
kfree(bl);
}
- while (!list_empty(&ctx->io_buffers_pages)) {
- struct page *page;
-
- page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
- list_del_init(&page->lru);
- __free_page(page);
+ list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
+ buf = list_entry(item, struct io_buffer, list);
+ kmem_cache_free(io_buf_cachep, buf);
}
}
@@ -289,7 +293,7 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
+ if (!tmp || tmp > MAX_BIDS_PER_BGID)
return -EINVAL;
memset(p, 0, sizeof(*p));
@@ -332,7 +336,7 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
tmp = READ_ONCE(sqe->fd);
- if (!tmp || tmp > USHRT_MAX)
+ if (!tmp || tmp > MAX_BIDS_PER_BGID)
return -E2BIG;
p->nbufs = tmp;
p->addr = READ_ONCE(sqe->addr);
@@ -352,17 +356,18 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
tmp = READ_ONCE(sqe->off);
if (tmp > USHRT_MAX)
return -E2BIG;
- if (tmp + p->nbufs >= USHRT_MAX)
+ if (tmp + p->nbufs > MAX_BIDS_PER_BGID)
return -EINVAL;
p->bid = tmp;
return 0;
}
+#define IO_BUFFER_ALLOC_BATCH 64
+
static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- struct page *page;
- int bufs_in_page;
+ struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH];
+ int allocated;
/*
* Completions that don't happen inline (eg not under uring_lock) will
@@ -382,22 +387,25 @@ static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
/*
* No free buffers and no completion entries either. Allocate a new
- * page worth of buffer entries and add those to our freelist.
+ * batch of buffer entries and add those to our freelist.
*/
- page = alloc_page(GFP_KERNEL_ACCOUNT);
- if (!page)
- return -ENOMEM;
-
- list_add(&page->lru, &ctx->io_buffers_pages);
- buf = page_address(page);
- bufs_in_page = PAGE_SIZE / sizeof(*buf);
- while (bufs_in_page) {
- list_add_tail(&buf->list, &ctx->io_buffers_cache);
- buf++;
- bufs_in_page--;
+ allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT,
+ ARRAY_SIZE(bufs), (void **) bufs);
+ if (unlikely(!allocated)) {
+ /*
+ * Bulk alloc is all-or-nothing. If we fail to get a batch,
+ * retry single alloc to be on the safe side.
+ */
+ bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL);
+ if (!bufs[0])
+ return -ENOMEM;
+ allocated = 1;
}
+ while (allocated)
+ list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache);
+
return 0;
}
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 3b9c6489b8b6..aadcbf7136b0 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -33,6 +33,7 @@
#include "poll.h"
#include "cancel.h"
#include "rw.h"
+#include "waitid.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
@@ -63,6 +64,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
+ .vectored = 1,
.prep = io_prep_rw,
.issue = io_read,
},
@@ -76,6 +78,7 @@ const struct io_issue_def io_issue_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
+ .vectored = 1,
.prep = io_prep_rw,
.issue = io_write,
},
@@ -428,9 +431,21 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .buffer_select = 1,
+ .audit_skip = 1,
+ .prep = io_read_mshot_prep,
+ .issue = io_read_mshot,
+ },
+ [IORING_OP_WAITID] = {
+ .prep = io_waitid_prep,
+ .issue = io_waitid,
+ },
};
-
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
@@ -648,6 +663,13 @@ const struct io_cold_def io_cold_defs[] = {
.fail = io_sendrecv_fail,
#endif
},
+ [IORING_OP_READ_MULTISHOT] = {
+ .name = "READ_MULTISHOT",
+ },
+ [IORING_OP_WAITID] = {
+ .name = "WAITID",
+ .async_size = sizeof(struct io_waitid_async),
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/opdef.h b/io_uring/opdef.h
index c22c8696e749..9e5435ec27d0 100644
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@@ -29,6 +29,8 @@ struct io_issue_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
+ /* vectored opcode, set if 1) vectored, and 2) handler needs to know */
+ unsigned vectored : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index d9c853d10587..7034be555334 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1037,39 +1037,36 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
struct page **pages = NULL;
- int pret, ret = -ENOMEM;
+ int ret;
end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
start = ubuf >> PAGE_SHIFT;
nr_pages = end - start;
+ WARN_ON(!nr_pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
- goto done;
+ return ERR_PTR(-ENOMEM);
- ret = 0;
mmap_read_lock(current->mm);
- pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
- pages);
- if (pret == nr_pages)
+ ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages);
+ mmap_read_unlock(current->mm);
+
+ /* success, mapped all pages */
+ if (ret == nr_pages) {
*npages = nr_pages;
- else
- ret = pret < 0 ? pret : -EFAULT;
+ return pages;
+ }
- mmap_read_unlock(current->mm);
- if (ret) {
+ /* partial map, or didn't map anything */
+ if (ret >= 0) {
/* if we did partial map, release any pages we did get */
- if (pret > 0)
- unpin_user_pages(pages, pret);
- goto done;
- }
- ret = 0;
-done:
- if (ret < 0) {
- kvfree(pages);
- pages = ERR_PTR(ret);
+ if (ret)
+ unpin_user_pages(pages, ret);
+ ret = -EFAULT;
}
- return pages;
+ kvfree(pages);
+ return ERR_PTR(ret);
}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c8c822fa7980..ec0cc38ea682 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -123,6 +123,22 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
+/*
+ * Multishot read is prepared just like a normal read/write request, only
+ * difference is that we set the MULTISHOT flag.
+ */
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = io_prep_rw(req, sqe);
+ if (unlikely(ret))
+ return ret;
+
+ req->flags |= REQ_F_APOLL_MULTISHOT;
+ return 0;
+}
+
void io_readv_writev_cleanup(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
@@ -388,8 +404,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
buf = u64_to_user_ptr(rw->addr);
sqe_len = rw->len;
- if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE ||
- (req->flags & REQ_F_BUFFER_SELECT)) {
+ if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
if (io_do_buffer_select(req)) {
buf = io_buffer_select(req, &sqe_len, issue_flags);
if (!buf)
@@ -708,7 +723,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
return 0;
}
-int io_read(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct io_rw_state __s, *s = &__s;
@@ -776,8 +791,11 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
- /* if we can poll, just do that */
- if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ /*
+ * If we can poll, just do that. For a vectored read, we'll
+ * need to copy state first.
+ */
+ if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -853,7 +871,69 @@ done:
/* it's faster to check here then delegate to kfree */
if (iovec)
kfree(iovec);
- return kiocb_done(req, ret, issue_flags);
+ return ret;
+}
+
+int io_read(struct io_kiocb *req, unsigned int issue_flags)
+{
+ int ret;
+
+ ret = __io_read(req, issue_flags);
+ if (ret >= 0)
+ return kiocb_done(req, ret, issue_flags);
+
+ return ret;
+}
+
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
+{
+ unsigned int cflags = 0;
+ int ret;
+
+ /*
+ * Multishot MUST be used on a pollable file
+ */
+ if (!file_can_poll(req->file))
+ return -EBADFD;
+
+ ret = __io_read(req, issue_flags);
+
+ /*
+ * If we get -EAGAIN, recycle our buffer and just let normal poll
+ * handling arm it.
+ */
+ if (ret == -EAGAIN) {
+ io_kbuf_recycle(req, issue_flags);
+ return -EAGAIN;
+ }
+
+ /*
+ * Any successful return value will keep the multishot read armed.
+ */
+ if (ret > 0) {
+ /*
+ * Put our buffer and post a CQE. If we fail to post a CQE, then
+ * jump to the termination path. This request is then done.
+ */
+ cflags = io_put_kbuf(req, issue_flags);
+
+ if (io_fill_cqe_req_aux(req,
+ issue_flags & IO_URING_F_COMPLETE_DEFER,
+ ret, cflags | IORING_CQE_F_MORE)) {
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_ISSUE_SKIP_COMPLETE;
+ return -EAGAIN;
+ }
+ }
+
+ /*
+ * Either an error, or we've hit overflow posting the CQE. For any
+ * multishot request, hitting overflow will terminate it.
+ */
+ io_req_set_res(req, ret, cflags);
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ return IOU_STOP_MULTISHOT;
+ return IOU_OK;
}
int io_write(struct io_kiocb *req, unsigned int issue_flags)
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 4b89f9659366..c5aed03d42a4 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -23,3 +23,5 @@ int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
+int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 537795fddc87..00a5e5621a28 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -13,6 +13,51 @@
#include "rsrc.h"
#include "uring_cmd.h"
+static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
+ return;
+
+ cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_del(&req->hash_node);
+ io_ring_submit_unlock(ctx, issue_flags);
+}
+
+/*
+ * Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
+ * will try to cancel this issued command by sending ->uring_cmd() with
+ * issue_flags of IO_URING_F_CANCEL.
+ *
+ * The command is guaranteed to not be done when calling ->uring_cmd()
+ * with IO_URING_F_CANCEL, but it is driver's responsibility to deal
+ * with race between io_uring canceling and normal completion.
+ */
+void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(cmd);
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
+ cmd->flags |= IORING_URING_CMD_CANCELABLE;
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
+ io_ring_submit_unlock(ctx, issue_flags);
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
+
+struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
+{
+ return cmd_to_io_kiocb(cmd)->task;
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_get_task);
+
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -56,6 +101,8 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ io_uring_cmd_del_cancelable(ioucmd, issue_flags);
+
if (ret < 0)
req_set_fail(req);
@@ -91,7 +138,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
- if (ioucmd->flags & ~IORING_URING_CMD_FIXED)
+ if (ioucmd->flags & ~IORING_URING_CMD_MASK)
return -EINVAL;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
new file mode 100644
index 000000000000..6f851978606d
--- /dev/null
+++ b/io_uring/waitid.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for async notification of waitid
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "cancel.h"
+#include "waitid.h"
+#include "../kernel/exit.h"
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts);
+
+#define IO_WAITID_CANCEL_FLAG BIT(31)
+#define IO_WAITID_REF_MASK GENMASK(30, 0)
+
+struct io_waitid {
+ struct file *file;
+ int which;
+ pid_t upid;
+ int options;
+ atomic_t refs;
+ struct wait_queue_head *head;
+ struct siginfo __user *infop;
+ struct waitid_info info;
+};
+
+static void io_waitid_free(struct io_kiocb *req)
+{
+ struct io_waitid_async *iwa = req->async_data;
+
+ put_pid(iwa->wo.wo_pid);
+ kfree(req->async_data);
+ req->async_data = NULL;
+ req->flags &= ~REQ_F_ASYNC_DATA;
+}
+
+#ifdef CONFIG_COMPAT
+static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
+{
+ struct compat_siginfo __user *infop;
+ bool ret;
+
+ infop = (struct compat_siginfo __user *) iw->infop;
+
+ if (!user_write_access_begin(infop, sizeof(*infop)))
+ return false;
+
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+#endif
+
+static bool io_waitid_copy_si(struct io_kiocb *req, int signo)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ bool ret;
+
+ if (!iw->infop)
+ return true;
+
+#ifdef CONFIG_COMPAT
+ if (req->ctx->compat)
+ return io_waitid_compat_copy_si(iw, signo);
+#endif
+
+ if (!user_write_access_begin(iw->infop, sizeof(*iw->infop)))
+ return false;
+
+ unsafe_put_user(signo, &iw->infop->si_signo, Efault);
+ unsafe_put_user(0, &iw->infop->si_errno, Efault);
+ unsafe_put_user(iw->info.cause, &iw->infop->si_code, Efault);
+ unsafe_put_user(iw->info.pid, &iw->infop->si_pid, Efault);
+ unsafe_put_user(iw->info.uid, &iw->infop->si_uid, Efault);
+ unsafe_put_user(iw->info.status, &iw->infop->si_status, Efault);
+ ret = true;
+done:
+ user_write_access_end();
+ return ret;
+Efault:
+ ret = false;
+ goto done;
+}
+
+static int io_waitid_finish(struct io_kiocb *req, int ret)
+{
+ int signo = 0;
+
+ if (ret > 0) {
+ signo = SIGCHLD;
+ ret = 0;
+ }
+
+ if (!io_waitid_copy_si(req, signo))
+ ret = -EFAULT;
+ io_waitid_free(req);
+ return ret;
+}
+
+static void io_waitid_complete(struct io_kiocb *req, int ret)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_tw_state ts = { .locked = true };
+
+ /* anyone completing better be holding a reference */
+ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
+
+ lockdep_assert_held(&req->ctx->uring_lock);
+
+ /*
+ * Did cancel find it meanwhile?
+ */
+ if (hlist_unhashed(&req->hash_node))
+ return;
+
+ hlist_del_init(&req->hash_node);
+
+ ret = io_waitid_finish(req, ret);
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ io_req_task_complete(req, &ts);
+}
+
+static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ /*
+ * Mark us canceled regardless of ownership. This will prevent a
+ * potential retry from a spurious wakeup.
+ */
+ atomic_or(IO_WAITID_CANCEL_FLAG, &iw->refs);
+
+ /* claim ownership */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return false;
+
+ spin_lock_irq(&iw->head->lock);
+ list_del_init(&iwa->wo.child_wait.entry);
+ spin_unlock_irq(&iw->head->lock);
+ io_waitid_complete(req, -ECANCELED);
+ return true;
+}
+
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ int nr = 0;
+
+ if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED))
+ return -ENOENT;
+
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (req->cqe.user_data != cd->data &&
+ !(cd->flags & IORING_ASYNC_CANCEL_ANY))
+ continue;
+ if (__io_waitid_cancel(ctx, req))
+ nr++;
+ if (!(cd->flags & IORING_ASYNC_CANCEL_ALL))
+ break;
+ }
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (nr)
+ return nr;
+
+ return -ENOENT;
+}
+
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *req;
+ bool found = false;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) {
+ if (!io_match_task_safe(req, task, cancel_all))
+ continue;
+ __io_waitid_cancel(ctx, req);
+ found = true;
+ }
+
+ return found;
+}
+
+static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_waitid_async *iwa = req->async_data;
+
+ if (!atomic_sub_return(1, &iw->refs))
+ return false;
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ return true;
+}
+
+static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts)
+{
+ struct io_waitid_async *iwa = req->async_data;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret;
+
+ io_tw_lock(ctx, ts);
+
+ ret = __do_wait(&iwa->wo);
+
+ /*
+ * If we get -ERESTARTSYS here, we need to re-arm and check again
+ * to ensure we get another callback. If the retry works, then we can
+ * just remove ourselves from the waitqueue again and finish the
+ * request.
+ */
+ if (unlikely(ret == -ERESTARTSYS)) {
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ /* Don't retry if cancel found it meanwhile */
+ ret = -ECANCELED;
+ if (!(atomic_read(&iw->refs) & IO_WAITID_CANCEL_FLAG)) {
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /* retry armed, drop our ref */
+ io_waitid_drop_issue_ref(req);
+ return;
+ }
+
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ }
+ }
+
+ io_waitid_complete(req, ret);
+}
+
+static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+{
+ struct wait_opts *wo = container_of(wait, struct wait_opts, child_wait);
+ struct io_waitid_async *iwa = container_of(wo, struct io_waitid_async, wo);
+ struct io_kiocb *req = iwa->req;
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct task_struct *p = key;
+
+ if (!pid_child_should_wake(wo, p))
+ return 0;
+
+ /* cancel is in progress */
+ if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK)
+ return 1;
+
+ req->io_task_work.func = io_waitid_cb;
+ io_req_task_work_add(req);
+ list_del_init(&wait->entry);
+ return 1;
+}
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+
+ if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags)
+ return -EINVAL;
+
+ iw->which = READ_ONCE(sqe->len);
+ iw->upid = READ_ONCE(sqe->fd);
+ iw->options = READ_ONCE(sqe->file_index);
+ iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+ return 0;
+}
+
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_waitid_async *iwa;
+ int ret;
+
+ if (io_alloc_async_data(req))
+ return -ENOMEM;
+
+ iwa = req->async_data;
+ iwa->req = req;
+
+ ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
+ iw->options, NULL);
+ if (ret)
+ goto done;
+
+ /*
+ * Mark the request as busy upfront, in case we're racing with the
+ * wakeup. If we are, then we'll notice when we drop this initial
+ * reference again after arming.
+ */
+ atomic_set(&iw->refs, 1);
+
+ /*
+ * Cancel must hold the ctx lock, so there's no risk of cancelation
+ * finding us until a) we remain on the list, and b) the lock is
+ * dropped. We only need to worry about racing with the wakeup
+ * callback.
+ */
+ io_ring_submit_lock(ctx, issue_flags);
+ hlist_add_head(&req->hash_node, &ctx->waitid_list);
+
+ init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
+ iwa->wo.child_wait.private = req->task;
+ iw->head = &current->signal->wait_chldexit;
+ add_wait_queue(iw->head, &iwa->wo.child_wait);
+
+ ret = __do_wait(&iwa->wo);
+ if (ret == -ERESTARTSYS) {
+ /*
+ * Nobody else grabbed a reference, it'll complete when we get
+ * a waitqueue callback, or if someone cancels it.
+ */
+ if (!io_waitid_drop_issue_ref(req)) {
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ /*
+ * Wakeup triggered, racing with us. It was prevented from
+ * completing because of that, queue up the tw to do that.
+ */
+ io_ring_submit_unlock(ctx, issue_flags);
+ return IOU_ISSUE_SKIP_COMPLETE;
+ }
+
+ hlist_del_init(&req->hash_node);
+ remove_wait_queue(iw->head, &iwa->wo.child_wait);
+ ret = io_waitid_finish(req, ret);
+
+ io_ring_submit_unlock(ctx, issue_flags);
+done:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/waitid.h b/io_uring/waitid.h
new file mode 100644
index 000000000000..956a8adafe8c
--- /dev/null
+++ b/io_uring/waitid.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "../kernel/exit.h"
+
+struct io_waitid_async {
+ struct io_kiocb *req;
+ struct wait_opts wo;
+};
+
+int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_waitid(struct io_kiocb *req, unsigned int issue_flags);
+int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
+ unsigned int issue_flags);
+bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
+ bool cancel_all);
diff --git a/kernel/exit.c b/kernel/exit.c
index edb50b4c9972..2b4a232f2f68 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -74,6 +74,8 @@
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+#include "exit.h"
+
/*
* The default value should be high enough to not crash a system that randomly
* crashes its kernel from time to time, but low enough to at least not permit
@@ -1037,26 +1039,6 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
-struct waitid_info {
- pid_t pid;
- uid_t uid;
- int status;
- int cause;
-};
-
-struct wait_opts {
- enum pid_type wo_type;
- int wo_flags;
- struct pid *wo_pid;
-
- struct waitid_info *wo_info;
- int wo_stat;
- struct rusage *wo_rusage;
-
- wait_queue_entry_t child_wait;
- int notask_error;
-};
-
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
@@ -1520,6 +1502,17 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
return 0;
}
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
+{
+ if (!eligible_pid(wo, p))
+ return false;
+
+ if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
+ return false;
+
+ return true;
+}
+
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
@@ -1527,13 +1520,10 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
child_wait);
struct task_struct *p = key;
- if (!eligible_pid(wo, p))
- return 0;
+ if (pid_child_should_wake(wo, p))
+ return default_wake_function(wait, mode, sync, key);
- if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
- return 0;
-
- return default_wake_function(wait, mode, sync, key);
+ return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
@@ -1582,16 +1572,10 @@ static int do_wait_pid(struct wait_opts *wo)
return 0;
}
-static long do_wait(struct wait_opts *wo)
+long __do_wait(struct wait_opts *wo)
{
- int retval;
+ long retval;
- trace_sched_process_wait(wo->wo_pid);
-
- init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
- wo->child_wait.private = current;
- add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
-repeat:
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
@@ -1603,24 +1587,23 @@ repeat:
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
- set_current_state(TASK_INTERRUPTIBLE);
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
- goto end;
+ return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
- goto end;
+ return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
- goto end;
+ return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
@@ -1630,27 +1613,44 @@ repeat:
notask:
retval = wo->notask_error;
- if (!retval && !(wo->wo_flags & WNOHANG)) {
- retval = -ERESTARTSYS;
- if (!signal_pending(current)) {
- schedule();
- goto repeat;
- }
- }
-end:
+ if (!retval && !(wo->wo_flags & WNOHANG))
+ return -ERESTARTSYS;
+
+ return retval;
+}
+
+static long do_wait(struct wait_opts *wo)
+{
+ int retval;
+
+ trace_sched_process_wait(wo->wo_pid);
+
+ init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+ wo->child_wait.private = current;
+ add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ retval = __do_wait(wo);
+ if (retval != -ERESTARTSYS)
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ } while (1);
+
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
-static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
- int options, struct rusage *ru)
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru)
{
- struct wait_opts wo;
+ unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
- long ret;
- unsigned int f_flags = 0;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
@@ -1693,19 +1693,32 @@ static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
return -EINVAL;
}
- wo.wo_type = type;
- wo.wo_pid = pid;
- wo.wo_flags = options;
- wo.wo_info = infop;
- wo.wo_rusage = ru;
+ wo->wo_type = type;
+ wo->wo_pid = pid;
+ wo->wo_flags = options;
+ wo->wo_info = infop;
+ wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
- wo.wo_flags |= WNOHANG;
+ wo->wo_flags |= WNOHANG;
+
+ return 0;
+}
+
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
+{
+ struct wait_opts wo;
+ long ret;
+
+ ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
+ if (ret)
+ return ret;
ret = do_wait(&wo);
- if (!ret && !(options & WNOHANG) && (f_flags & O_NONBLOCK))
+ if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
- put_pid(pid);
+ put_pid(wo.wo_pid);
return ret;
}
diff --git a/kernel/exit.h b/kernel/exit.h
new file mode 100644
index 000000000000..278faa26a653
--- /dev/null
+++ b/kernel/exit.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef LINUX_WAITID_H
+#define LINUX_WAITID_H
+
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
+struct wait_opts {
+ enum pid_type wo_type;
+ int wo_flags;
+ struct pid *wo_pid;
+
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
+
+ wait_queue_entry_t child_wait;
+ int notask_error;
+};
+
+bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p);
+long __do_wait(struct wait_opts *wo);
+int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
+ struct waitid_info *infop, int options,
+ struct rusage *ru);
+#endif