aboutsummaryrefslogtreecommitdiff
path: root/drivers/vfio/vfio.c
diff options
context:
space:
mode:
authorAlex Williamson <alex.williamson@redhat.com>2022-03-03 09:51:03 -0700
committerAlex Williamson <alex.williamson@redhat.com>2022-03-03 09:51:03 -0700
commitb042b27868c00142da1a7e31f4740a5fbd5628ed (patch)
tree1f4da31129aaf99885388a518eb2ae98e12ba94d /drivers/vfio/vfio.c
parentcfb92440ee71adcc2105b0890bb01ac3cddb8507 (diff)
parent88faa5e8ead62a2a559b1246bba7aa982fde7c67 (diff)
Merge tag 'mlx5-vfio-v10' of https://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux into v5.18/vfio/next/mlx5-migration-v10
Add mlx5 live migration driver and v2 migration protocol This series adds mlx5 live migration driver for VFs that are migration capable and includes the v2 migration protocol definition and mlx5 implementation. The mlx5 driver uses the vfio_pci_core split to create a specific VFIO PCI driver that matches the mlx5 virtual functions. The driver provides the same experience as normal vfio-pci with the addition of migration support. In HW the migration is controlled by the PF function, using its mlx5_core driver, and the VFIO PCI VF driver co-ordinates with the PF to execute the migration actions. The bulk of the v2 migration protocol is semantically the same v1, however it has been recast into a FSM for the device_state and the actual syscall interface uses normal ioctl(), read() and write() instead of building a syscall interface using the region. Several bits of infrastructure work are included here: - pci_iov_vf_id() to help drivers like mlx5 figure out the VF index from a BDF - pci_iov_get_pf_drvdata() to clarify the tricky locking protocol when a VF reaches into its PF's driver - mlx5_core uses the normal SRIOV lifecycle and disables SRIOV before driver remove, to be compatible with pci_iov_get_pf_drvdata() - Lifting VFIO_DEVICE_FEATURE into core VFIO code This series comes after alot of discussion. Some major points: - v1 ABI compatible migration defined using the same FSM approach: https://lore.kernel.org/all/0-v1-a4f7cab64938+3f-vfio_mig_states_jgg@nvidia.com/ - Attempts to clarify how the v1 API works: Alex's: https://lore.kernel.org/kvm/163909282574.728533.7460416142511440919.stgit@omen/ Jason's: https://lore.kernel.org/all/0-v3-184b374ad0a8+24c-vfio_mig_doc_jgg@nvidia.com/ - Etherpad exploring the scope and questions of general VFIO migration: https://lore.kernel.org/kvm/87mtm2loml.fsf@redhat.com/ NOTE: As this series touched mlx5_core parts we need to send this in a pull request format to VFIO to avoid conflicts. Matching qemu changes can be previewed here: https://github.com/jgunthorpe/qemu/commits/vfio_migration_v2 Link: https://lore.kernel.org/all/20220224142024.147653-1-yishaih@nvidia.com Signed-of-by: Leon Romanovsky <leonro@nvidia.com>
Diffstat (limited to 'drivers/vfio/vfio.c')
-rw-r--r--drivers/vfio/vfio.c296
1 files changed, 292 insertions, 4 deletions
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 735d1d344af9..a4555014bd1e 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1557,15 +1557,303 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep)
return 0;
}
+/*
+ * vfio_mig_get_next_state - Compute the next step in the FSM
+ * @cur_fsm - The current state the device is in
+ * @new_fsm - The target state to reach
+ * @next_fsm - Pointer to the next step to get to new_fsm
+ *
+ * Return 0 upon success, otherwise -errno
+ * Upon success the next step in the state progression between cur_fsm and
+ * new_fsm will be set in next_fsm.
+ *
+ * This breaks down requests for combination transitions into smaller steps and
+ * returns the next step to get to new_fsm. The function may need to be called
+ * multiple times before reaching new_fsm.
+ *
+ */
+int vfio_mig_get_next_state(struct vfio_device *device,
+ enum vfio_device_mig_state cur_fsm,
+ enum vfio_device_mig_state new_fsm,
+ enum vfio_device_mig_state *next_fsm)
+{
+ enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_RUNNING_P2P + 1 };
+ /*
+ * The coding in this table requires the driver to implement the
+ * following FSM arcs:
+ * RESUMING -> STOP
+ * STOP -> RESUMING
+ * STOP -> STOP_COPY
+ * STOP_COPY -> STOP
+ *
+ * If P2P is supported then the driver must also implement these FSM
+ * arcs:
+ * RUNNING -> RUNNING_P2P
+ * RUNNING_P2P -> RUNNING
+ * RUNNING_P2P -> STOP
+ * STOP -> RUNNING_P2P
+ * Without P2P the driver must implement:
+ * RUNNING -> STOP
+ * STOP -> RUNNING
+ *
+ * The coding will step through multiple states for some combination
+ * transitions; if all optional features are supported, this means the
+ * following ones:
+ * RESUMING -> STOP -> RUNNING_P2P
+ * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
+ * RESUMING -> STOP -> STOP_COPY
+ * RUNNING -> RUNNING_P2P -> STOP
+ * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
+ * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
+ * RUNNING_P2P -> STOP -> RESUMING
+ * RUNNING_P2P -> STOP -> STOP_COPY
+ * STOP -> RUNNING_P2P -> RUNNING
+ * STOP_COPY -> STOP -> RESUMING
+ * STOP_COPY -> STOP -> RUNNING_P2P
+ * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
+ */
+ static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
+ [VFIO_DEVICE_STATE_STOP] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_RUNNING] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_STOP_COPY] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_RESUMING] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ [VFIO_DEVICE_STATE_ERROR] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
+ [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
+ },
+ };
+
+ static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
+ [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
+ [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
+ [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
+ [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
+ [VFIO_DEVICE_STATE_RUNNING_P2P] =
+ VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
+ [VFIO_DEVICE_STATE_ERROR] = ~0U,
+ };
+
+ if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
+ (state_flags_table[cur_fsm] & device->migration_flags) !=
+ state_flags_table[cur_fsm]))
+ return -EINVAL;
+
+ if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
+ (state_flags_table[new_fsm] & device->migration_flags) !=
+ state_flags_table[new_fsm])
+ return -EINVAL;
+
+ /*
+ * Arcs touching optional and unsupported states are skipped over. The
+ * driver will instead see an arc from the original state to the next
+ * logical state, as per the above comment.
+ */
+ *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
+ while ((state_flags_table[*next_fsm] & device->migration_flags) !=
+ state_flags_table[*next_fsm])
+ *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
+
+ return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
+
+/*
+ * Convert the drivers's struct file into a FD number and return it to userspace
+ */
+static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
+ struct vfio_device_feature_mig_state *mig)
+{
+ int ret;
+ int fd;
+
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0) {
+ ret = fd;
+ goto out_fput;
+ }
+
+ mig->data_fd = fd;
+ if (copy_to_user(arg, mig, sizeof(*mig))) {
+ ret = -EFAULT;
+ goto out_put_unused;
+ }
+ fd_install(fd, filp);
+ return 0;
+
+out_put_unused:
+ put_unused_fd(fd);
+out_fput:
+ fput(filp);
+ return ret;
+}
+
+static int
+vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
+ u32 flags, void __user *arg,
+ size_t argsz)
+{
+ size_t minsz =
+ offsetofend(struct vfio_device_feature_mig_state, data_fd);
+ struct vfio_device_feature_mig_state mig;
+ struct file *filp = NULL;
+ int ret;
+
+ if (!device->ops->migration_set_state ||
+ !device->ops->migration_get_state)
+ return -ENOTTY;
+
+ ret = vfio_check_feature(flags, argsz,
+ VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_GET,
+ sizeof(mig));
+ if (ret != 1)
+ return ret;
+
+ if (copy_from_user(&mig, arg, minsz))
+ return -EFAULT;
+
+ if (flags & VFIO_DEVICE_FEATURE_GET) {
+ enum vfio_device_mig_state curr_state;
+
+ ret = device->ops->migration_get_state(device, &curr_state);
+ if (ret)
+ return ret;
+ mig.device_state = curr_state;
+ goto out_copy;
+ }
+
+ /* Handle the VFIO_DEVICE_FEATURE_SET */
+ filp = device->ops->migration_set_state(device, mig.device_state);
+ if (IS_ERR(filp) || !filp)
+ goto out_copy;
+
+ return vfio_ioct_mig_return_fd(filp, arg, &mig);
+out_copy:
+ mig.data_fd = -1;
+ if (copy_to_user(arg, &mig, sizeof(mig)))
+ return -EFAULT;
+ if (IS_ERR(filp))
+ return PTR_ERR(filp);
+ return 0;
+}
+
+static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
+ u32 flags, void __user *arg,
+ size_t argsz)
+{
+ struct vfio_device_feature_migration mig = {
+ .flags = device->migration_flags,
+ };
+ int ret;
+
+ if (!device->ops->migration_set_state ||
+ !device->ops->migration_get_state)
+ return -ENOTTY;
+
+ ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
+ sizeof(mig));
+ if (ret != 1)
+ return ret;
+ if (copy_to_user(arg, &mig, sizeof(mig)))
+ return -EFAULT;
+ return 0;
+}
+
+static int vfio_ioctl_device_feature(struct vfio_device *device,
+ struct vfio_device_feature __user *arg)
+{
+ size_t minsz = offsetofend(struct vfio_device_feature, flags);
+ struct vfio_device_feature feature;
+
+ if (copy_from_user(&feature, arg, minsz))
+ return -EFAULT;
+
+ if (feature.argsz < minsz)
+ return -EINVAL;
+
+ /* Check unknown flags */
+ if (feature.flags &
+ ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
+ return -EINVAL;
+
+ /* GET & SET are mutually exclusive except with PROBE */
+ if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_GET))
+ return -EINVAL;
+
+ switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_MIGRATION:
+ return vfio_ioctl_device_feature_migration(
+ device, feature.flags, arg->data,
+ feature.argsz - minsz);
+ case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
+ return vfio_ioctl_device_feature_mig_device_state(
+ device, feature.flags, arg->data,
+ feature.argsz - minsz);
+ default:
+ if (unlikely(!device->ops->device_feature))
+ return -EINVAL;
+ return device->ops->device_feature(device, feature.flags,
+ arg->data,
+ feature.argsz - minsz);
+ }
+}
+
static long vfio_device_fops_unl_ioctl(struct file *filep,
unsigned int cmd, unsigned long arg)
{
struct vfio_device *device = filep->private_data;
- if (unlikely(!device->ops->ioctl))
- return -EINVAL;
-
- return device->ops->ioctl(device, cmd, arg);
+ switch (cmd) {
+ case VFIO_DEVICE_FEATURE:
+ return vfio_ioctl_device_feature(device, (void __user *)arg);
+ default:
+ if (unlikely(!device->ops->ioctl))
+ return -EINVAL;
+ return device->ops->ioctl(device, cmd, arg);
+ }
}
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,