From af6605f87ca585f426272e4a5096771ae273f30f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 1 Apr 2024 11:02:22 -0600 Subject: MAINTAINERS: Orphan vfio fsl-mc bus driver Email to Diana is bouncing. I've reached out through other channels but not been successful for a couple months. Lore shows no email from Diana for approximately 18 months. Mark this driver as orphaned. Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240401170224.3700774-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index ebf03f5f0619..aa415ddc6c03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23225,9 +23225,8 @@ F: include/linux/vfio_pci_core.h F: include/uapi/linux/vfio.h VFIO FSL-MC DRIVER -M: Diana Craciun L: kvm@vger.kernel.org -S: Maintained +S: Orphan F: drivers/vfio/fsl-mc/ VFIO HISILICON PCI DRIVER -- cgit From 071e7310e69368de551388088827c57df05f59aa Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 1 Apr 2024 13:54:02 -0600 Subject: vfio/pci: Pass eventfd context to IRQ handler Create a link back to the vfio_pci_core_device on the eventfd context object to avoid lookups in the interrupt path. The context is known valid in the interrupt handler. Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240401195406.3720453-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index fb5392b749ff..31dbc2d061f3 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -23,11 +23,12 @@ #include "vfio_pci_priv.h" struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; + struct vfio_pci_core_device *vdev; + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; struct irq_bypass_producer producer; }; @@ -228,15 +229,11 @@ void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) static irqreturn_t vfio_intx_handler(int irq, void *dev_id) { - struct vfio_pci_core_device *vdev = dev_id; - struct vfio_pci_irq_ctx *ctx; + struct vfio_pci_irq_ctx *ctx = dev_id; + struct vfio_pci_core_device *vdev = ctx->vdev; unsigned long flags; int ret = IRQ_NONE; - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - return ret; - spin_lock_irqsave(&vdev->irqlock, flags); if (!vdev->pci_2_3) { @@ -282,6 +279,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, ctx->name = name; ctx->trigger = trigger; + ctx->vdev = vdev; /* * Fill the initial masked state based on virq_disabled. 
After @@ -312,7 +310,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; ret = request_irq(pdev->irq, vfio_intx_handler, - irqflags, ctx->name, vdev); + irqflags, ctx->name, ctx); if (ret) { vdev->irq_type = VFIO_PCI_NUM_IRQS; kfree(name); @@ -358,7 +356,7 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev) if (ctx) { vfio_virqfd_disable(&ctx->unmask); vfio_virqfd_disable(&ctx->mask); - free_irq(pdev->irq, vdev); + free_irq(pdev->irq, ctx); if (ctx->trigger) eventfd_ctx_put(ctx->trigger); kfree(ctx->name); -- cgit From d530531936482f97b7b3784793d3514185b820d4 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 1 Apr 2024 13:54:03 -0600 Subject: vfio/pci: Pass eventfd context object through irqfd Further avoid lookup of the context object by passing it through the irqfd data field. Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240401195406.3720453-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 31dbc2d061f3..d52825a096ad 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -85,19 +85,14 @@ vfio_irq_ctx_alloc(struct vfio_pci_core_device *vdev, unsigned long index) /* * INTx */ -static void vfio_send_intx_eventfd(void *opaque, void *unused) +static void vfio_send_intx_eventfd(void *opaque, void *data) { struct vfio_pci_core_device *vdev = opaque; if (likely(is_intx(vdev) && !vdev->virq_disabled)) { - struct vfio_pci_irq_ctx *ctx; - struct eventfd_ctx *trigger; + struct vfio_pci_irq_ctx *ctx = data; + struct eventfd_ctx *trigger = READ_ONCE(ctx->trigger); - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - return; - - trigger = READ_ONCE(ctx->trigger); if (likely(trigger)) eventfd_signal(trigger); } @@ -167,11 +162,11 @@ bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) * a signal is necessary, which can then be handled via a work queue * or directly depending on the caller. 
*/ -static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) +static int vfio_pci_intx_unmask_handler(void *opaque, void *data) { struct vfio_pci_core_device *vdev = opaque; struct pci_dev *pdev = vdev->pdev; - struct vfio_pci_irq_ctx *ctx; + struct vfio_pci_irq_ctx *ctx = data; unsigned long flags; int ret = 0; @@ -187,10 +182,6 @@ static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) goto out_unlock; } - ctx = vfio_irq_ctx_get(vdev, 0); - if (WARN_ON_ONCE(!ctx)) - goto out_unlock; - if (ctx->masked && !vdev->virq_disabled) { /* * A pending interrupt here would immediately trigger, @@ -214,10 +205,12 @@ out_unlock: static void __vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) { + struct vfio_pci_irq_ctx *ctx = vfio_irq_ctx_get(vdev, 0); + lockdep_assert_held(&vdev->igate); - if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) - vfio_send_intx_eventfd(vdev, NULL); + if (vfio_pci_intx_unmask_handler(vdev, ctx) > 0) + vfio_send_intx_eventfd(vdev, ctx); } void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev) @@ -249,7 +242,7 @@ static irqreturn_t vfio_intx_handler(int irq, void *dev_id) spin_unlock_irqrestore(&vdev->irqlock, flags); if (ret == IRQ_HANDLED) - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, ctx); return ret; } @@ -604,7 +597,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_core_device *vdev, if (fd >= 0) return vfio_virqfd_enable((void *) vdev, vfio_pci_intx_unmask_handler, - vfio_send_intx_eventfd, NULL, + vfio_send_intx_eventfd, ctx, &ctx->unmask, fd); vfio_virqfd_disable(&ctx->unmask); @@ -671,11 +664,11 @@ static int vfio_pci_set_intx_trigger(struct vfio_pci_core_device *vdev, return -EINVAL; if (flags & VFIO_IRQ_SET_DATA_NONE) { - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0)); } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger = *(uint8_t *)data; if (trigger) - vfio_send_intx_eventfd(vdev, NULL); + vfio_send_intx_eventfd(vdev, vfio_irq_ctx_get(vdev, 0)); } return 0; } -- cgit From 82b951e6fbd31d85ae7f4feb5f00ddd4c5d256e2 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 15 Apr 2024 09:50:29 +0800 Subject: vfio/pci: fix potential memory leak in vfio_intx_enable() If vfio_irq_ctx_alloc() fails, the previously allocated 'name' buffer is leaked; free it on that error path. Fixes: 18c198c96a81 ("vfio/pci: Create persistent INTx handler") Signed-off-by: Ye Bin Reviewed-by: Kevin Tian Acked-by: Reinette Chatre Link: https://lore.kernel.org/r/20240415015029.3699844-1-yebin10@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index d52825a096ad..8382c5834335 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -267,8 +267,10 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, return -ENOMEM; ctx = vfio_irq_ctx_alloc(vdev, 0); - if (!ctx) + if (!ctx) { + kfree(name); return -ENOMEM; + } ctx->name = name; ctx->trigger = trigger; -- cgit From 06fe8fd6808562971637c6b133c806bcf49097ad Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Tue, 23 Apr 2024 16:40:20 +0530 Subject: genirq/msi: Add MSI allocation helper and export MSI functions The MSI allocation and free functions can be used directly by device drivers without any wrapper provided by bus drivers, so export these MSI functions.
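As a rough illustration of what the exports enable — a hedged sketch, not part of the patch; the function names, 'nvec', and the minimal error handling are assumptions — a modular driver can then drive allocation and teardown against its device's MSI domain directly (the wrapper introduced next reduces the range arguments to a simple count):

	static int my_driver_setup_msis(struct device *dev, unsigned int nvec)
	{
		unsigned int i;
		int ret;

		/* Allocate vectors 0..nvec-1 from the device's default MSI domain */
		ret = msi_domain_alloc_irqs_range(dev, MSI_DEFAULT_DOMAIN, 0, nvec - 1);
		if (ret)
			return ret;

		for (i = 0; i < nvec; i++) {
			unsigned int virq = msi_get_virq(dev, i);

			if (!virq) {	/* no Linux IRQ mapped at this index */
				msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN);
				return -ENOENT;
			}
			/* request_irq(virq, ...) as usual */
		}

		return 0;
	}

	static void my_driver_teardown_msis(struct device *dev)
	{
		msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN);
	}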
Also, add a wrapper API to allocate MSIs providing only the number of interrupts rather than range for simpler driver usage. Signed-off-by: Nipun Gupta Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20240423111021.1686144-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- include/linux/msi.h | 6 ++++++ kernel/irq/msi.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/include/linux/msi.h b/include/linux/msi.h index 26d07e23052e..765a65581a66 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -676,6 +676,12 @@ int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nve void platform_device_msi_free_irqs_all(struct device *dev); bool msi_device_has_isolated_msi(struct device *dev); + +static inline int msi_domain_alloc_irqs(struct device *dev, unsigned int domid, int nirqs) +{ + return msi_domain_alloc_irqs_range(dev, domid, 0, nirqs - 1); +} + #else /* CONFIG_GENERIC_MSI_IRQ */ static inline bool msi_device_has_isolated_msi(struct device *dev) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index f90952ebc494..2024f89baea4 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -1434,6 +1434,7 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, msi_unlock_descs(dev); return ret; } +EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range); /** * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain @@ -1680,6 +1681,7 @@ void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, msi_domain_free_irqs_range_locked(dev, domid, first, last); msi_unlock_descs(dev); } +EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all); /** * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain -- cgit From 848e447e000c41894ff931dc7c004fd42c8840f8 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Tue, 23 Apr 2024 16:40:21 +0530 Subject: vfio/cdx: add interrupt support Support the following ioctls for CDX devices: - VFIO_DEVICE_GET_IRQ_INFO - VFIO_DEVICE_SET_IRQS This allows user to set an eventfd for cdx device interrupts and trigger this interrupt eventfd from userspace. All CDX device interrupts are MSIs. The MSIs are allocated from the CDX-MSI domain. Signed-off-by: Nipun Gupta Reviewed-by: Pieter Jansen van Vuuren Link: https://lore.kernel.org/r/20240423111021.1686144-2-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/Makefile | 2 +- drivers/vfio/cdx/intr.c | 217 +++++++++++++++++++++++++++++++++++++++++++++ drivers/vfio/cdx/main.c | 63 ++++++++++++- drivers/vfio/cdx/private.h | 18 ++++ 4 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/cdx/intr.c diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile index cd4a2e6fe609..df92b320122a 100644 --- a/drivers/vfio/cdx/Makefile +++ b/drivers/vfio/cdx/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o -vfio-cdx-objs := main.o +vfio-cdx-objs := main.o intr.o diff --git a/drivers/vfio/cdx/intr.c b/drivers/vfio/cdx/intr.c new file mode 100644 index 000000000000..986fa2a45fa4 --- /dev/null +++ b/drivers/vfio/cdx/intr.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "linux/cdx/cdx_bus.h" +#include "private.h" + +static irqreturn_t vfio_cdx_msihandler(int irq_no, void *arg) +{ + struct eventfd_ctx *trigger = arg; + + eventfd_signal(trigger); + return IRQ_HANDLED; +} + +static int vfio_cdx_msi_enable(struct vfio_cdx_device *vdev, int nvec) +{ + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct device *dev = vdev->vdev.dev; + int msi_idx, ret; + + vdev->cdx_irqs = kcalloc(nvec, sizeof(struct vfio_cdx_irq), GFP_KERNEL); + if (!vdev->cdx_irqs) + return -ENOMEM; + + ret = cdx_enable_msi(cdx_dev); + if (ret) { + kfree(vdev->cdx_irqs); + return ret; + } + + /* Allocate cdx MSIs */ + ret = msi_domain_alloc_irqs(dev, MSI_DEFAULT_DOMAIN, nvec); + if (ret) { + cdx_disable_msi(cdx_dev); + kfree(vdev->cdx_irqs); + return ret; + } + + for (msi_idx = 0; msi_idx < nvec; msi_idx++) + vdev->cdx_irqs[msi_idx].irq_no = msi_get_virq(dev, msi_idx); + + vdev->msi_count = nvec; + vdev->config_msi = 1; + + return 0; +} + +static int vfio_cdx_msi_set_vector_signal(struct vfio_cdx_device *vdev, + int vector, int fd) +{ + struct eventfd_ctx *trigger; + int irq_no, ret; + + if (vector < 0 || vector >= vdev->msi_count) + return -EINVAL; + + irq_no = vdev->cdx_irqs[vector].irq_no; + + if (vdev->cdx_irqs[vector].trigger) { + free_irq(irq_no, vdev->cdx_irqs[vector].trigger); + kfree(vdev->cdx_irqs[vector].name); + eventfd_ctx_put(vdev->cdx_irqs[vector].trigger); + vdev->cdx_irqs[vector].trigger = NULL; + } + + if (fd < 0) + return 0; + + vdev->cdx_irqs[vector].name = kasprintf(GFP_KERNEL, "vfio-msi[%d](%s)", + vector, dev_name(vdev->vdev.dev)); + if (!vdev->cdx_irqs[vector].name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(vdev->cdx_irqs[vector].name); + return PTR_ERR(trigger); + } + + ret = request_irq(irq_no, vfio_cdx_msihandler, 0, + vdev->cdx_irqs[vector].name, trigger); + if (ret) { + kfree(vdev->cdx_irqs[vector].name); + eventfd_ctx_put(trigger); + return ret; + } + + vdev->cdx_irqs[vector].trigger = trigger; + + return 0; +} + +static int vfio_cdx_msi_set_block(struct vfio_cdx_device *vdev, + unsigned int start, unsigned int count, + int32_t *fds) +{ + int i, j, ret = 0; + + if (start >= vdev->msi_count || start + count > vdev->msi_count) + return -EINVAL; + + for (i = 0, j = start; i < count && !ret; i++, j++) { + int fd = fds ? 
fds[i] : -1; + + ret = vfio_cdx_msi_set_vector_signal(vdev, j, fd); + } + + if (ret) { + for (--j; j >= (int)start; j--) + vfio_cdx_msi_set_vector_signal(vdev, j, -1); + } + + return ret; +} + +static void vfio_cdx_msi_disable(struct vfio_cdx_device *vdev) +{ + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct device *dev = vdev->vdev.dev; + + vfio_cdx_msi_set_block(vdev, 0, vdev->msi_count, NULL); + + if (!vdev->config_msi) + return; + + msi_domain_free_irqs_all(dev, MSI_DEFAULT_DOMAIN); + cdx_disable_msi(cdx_dev); + kfree(vdev->cdx_irqs); + + vdev->cdx_irqs = NULL; + vdev->msi_count = 0; + vdev->config_msi = 0; +} + +static int vfio_cdx_set_msi_trigger(struct vfio_cdx_device *vdev, + unsigned int index, unsigned int start, + unsigned int count, u32 flags, + void *data) +{ + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + int i; + + if (start + count > cdx_dev->num_msi) + return -EINVAL; + + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) { + vfio_cdx_msi_disable(vdev); + return 0; + } + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + s32 *fds = data; + int ret; + + if (vdev->config_msi) + return vfio_cdx_msi_set_block(vdev, start, count, + fds); + ret = vfio_cdx_msi_enable(vdev, cdx_dev->num_msi); + if (ret) + return ret; + + ret = vfio_cdx_msi_set_block(vdev, start, count, fds); + if (ret) + vfio_cdx_msi_disable(vdev); + + return ret; + } + + for (i = start; i < start + count; i++) { + if (!vdev->cdx_irqs[i].trigger) + continue; + if (flags & VFIO_IRQ_SET_DATA_NONE) { + eventfd_signal(vdev->cdx_irqs[i].trigger); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + u8 *bools = data; + + if (bools[i - start]) + eventfd_signal(vdev->cdx_irqs[i].trigger); + } + } + + return 0; +} + +int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, + u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data) +{ + if (flags & VFIO_IRQ_SET_ACTION_TRIGGER) + return vfio_cdx_set_msi_trigger(vdev, index, start, + count, flags, data); + else + return -EINVAL; +} + +/* Free All IRQs for the given device */ +void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev) +{ + /* + * Device does not support any interrupt or the interrupts + * were not configured + */ + if (!vdev->cdx_irqs) + return; + + vfio_cdx_set_msi_trigger(vdev, 0, 0, 0, VFIO_IRQ_SET_DATA_NONE, NULL); +} diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 9cff8d75789e..67465fad5b4b 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -61,6 +61,7 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev) kfree(vdev->regions); cdx_dev_reset(core_vdev->dev); + vfio_cdx_irqs_cleanup(vdev); } static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, @@ -123,7 +124,7 @@ static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, info.flags |= VFIO_DEVICE_FLAGS_RESET; info.num_regions = cdx_dev->res_count; - info.num_irqs = 0; + info.num_irqs = cdx_dev->num_msi ? 1 : 0; return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } @@ -152,6 +153,62 @@ static int vfio_cdx_ioctl_get_region_info(struct vfio_cdx_device *vdev, return copy_to_user(arg, &info, minsz) ? 
-EFAULT : 0; } +static int vfio_cdx_ioctl_get_irq_info(struct vfio_cdx_device *vdev, + struct vfio_irq_info __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_info, count); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_irq_info info; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= 1) + return -EINVAL; + + if (!cdx_dev->num_msi) + return -EINVAL; + + info.flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE; + info.count = cdx_dev->num_msi; + + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; +} + +static int vfio_cdx_ioctl_set_irqs(struct vfio_cdx_device *vdev, + struct vfio_irq_set __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_set, count); + struct cdx_device *cdx_dev = to_cdx_device(vdev->vdev.dev); + struct vfio_irq_set hdr; + size_t data_size = 0; + u8 *data = NULL; + int ret = 0; + + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; + + ret = vfio_set_irqs_validate_and_prepare(&hdr, cdx_dev->num_msi, + 1, &data_size); + if (ret) + return ret; + + if (data_size) { + data = memdup_user(arg->data, data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + ret = vfio_cdx_set_irqs_ioctl(vdev, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + kfree(data); + + return ret; +} + static long vfio_cdx_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { @@ -164,6 +221,10 @@ static long vfio_cdx_ioctl(struct vfio_device *core_vdev, return vfio_cdx_ioctl_get_info(vdev, uarg); case VFIO_DEVICE_GET_REGION_INFO: return vfio_cdx_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_GET_IRQ_INFO: + return vfio_cdx_ioctl_get_irq_info(vdev, uarg); + case VFIO_DEVICE_SET_IRQS: + return vfio_cdx_ioctl_set_irqs(vdev, uarg); case VFIO_DEVICE_RESET: return cdx_dev_reset(core_vdev->dev); default: diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 8e9d25913728..dc56729b3114 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -13,6 +13,14 @@ static inline u64 vfio_cdx_index_to_offset(u32 index) return ((u64)(index) << VFIO_CDX_OFFSET_SHIFT); } +struct vfio_cdx_irq { + u32 flags; + u32 count; + int irq_no; + struct eventfd_ctx *trigger; + char *name; +}; + struct vfio_cdx_region { u32 flags; u32 type; @@ -23,8 +31,18 @@ struct vfio_cdx_region { struct vfio_cdx_device { struct vfio_device vdev; struct vfio_cdx_region *regions; + struct vfio_cdx_irq *cdx_irqs; u32 flags; #define BME_SUPPORT BIT(0) + u32 msi_count; + u8 config_msi; }; +int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, + u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data); + +void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev); + #endif /* VFIO_CDX_PRIVATE_H */ -- cgit From bb208810b1abf1c84870cfbe1cc9cf1a1d35c607 Mon Sep 17 00:00:00 2001 From: Xin Zeng Date: Fri, 26 Apr 2024 14:40:51 +0800 Subject: vfio/qat: Add vfio_pci driver for Intel QAT SR-IOV VF devices Add vfio pci variant driver for Intel QAT SR-IOV VF devices. This driver registers to the vfio subsystem through the interfaces exposed by the subsystem. It follows the live migration protocol v2 defined in uapi/linux/vfio.h and interacts with Intel QAT PF driver through a set of interfaces defined in qat/qat_mig_dev.h to support live migration of Intel QAT VF devices. This version only covers migration for Intel QAT GEN4 VF devices. 
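For orientation, the v2 protocol from uapi/linux/vfio.h is exercised from userspace through VFIO_DEVICE_FEATURE with VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE, and the device state flows over the data_fd the kernel hands back. The sketch below is illustrative only — save_device_state(), device_fd and out_fd are assumptions, and error unwinding is trimmed; it is not taken from this patch:

	#include <errno.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Move a migration-capable device (such as this QAT VF) to
	 * STOP_COPY and drain its saved state to out_fd. */
	static int save_device_state(int device_fd, int out_fd)
	{
		uint8_t buf[sizeof(struct vfio_device_feature) +
			    sizeof(struct vfio_device_feature_mig_state)]
			__attribute__((aligned(8))) = { 0 };
		struct vfio_device_feature *feature = (void *)buf;
		struct vfio_device_feature_mig_state *mig = (void *)feature->data;
		char page[4096];
		ssize_t n;

		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
		mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;

		/* The driver walks the intermediate states itself, as
		 * qat_vf_pci_set_device_state() below does via
		 * vfio_mig_get_next_state(). */
		if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
			return -errno;

		/* data_fd is filled in by the kernel and carries the state */
		while ((n = read(mig->data_fd, page, sizeof(page))) > 0)
			if (write(out_fd, page, n) != n)
				return -EIO;

		return n < 0 ? -errno : 0;
	}

Resuming on the target is the mirror image: set VFIO_DEVICE_STATE_RESUMING, write the saved bytes into the returned data_fd, then set VFIO_DEVICE_STATE_STOP.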
Co-developed-by: Yahui Cao Signed-off-by: Yahui Cao Signed-off-by: Xin Zeng Reviewed-by: Giovanni Cabiddu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240426064051.2859652-1-xin.zeng@intel.com Signed-off-by: Alex Williamson --- MAINTAINERS | 8 + drivers/vfio/pci/Kconfig | 2 + drivers/vfio/pci/Makefile | 2 + drivers/vfio/pci/qat/Kconfig | 12 + drivers/vfio/pci/qat/Makefile | 3 + drivers/vfio/pci/qat/main.c | 702 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 729 insertions(+) create mode 100644 drivers/vfio/pci/qat/Kconfig create mode 100644 drivers/vfio/pci/qat/Makefile create mode 100644 drivers/vfio/pci/qat/main.c diff --git a/MAINTAINERS b/MAINTAINERS index aa415ddc6c03..909d85678f79 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23280,6 +23280,14 @@ L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/platform/ +VFIO QAT PCI DRIVER +M: Xin Zeng +M: Giovanni Cabiddu +L: kvm@vger.kernel.org +L: qat-linux@intel.com +S: Supported +F: drivers/vfio/pci/qat/ + VFIO VIRTIO PCI DRIVER M: Yishai Hadas L: kvm@vger.kernel.org diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 15821a2d77d2..bf50ffa10bde 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -69,4 +69,6 @@ source "drivers/vfio/pci/virtio/Kconfig" source "drivers/vfio/pci/nvgrace-gpu/Kconfig" +source "drivers/vfio/pci/qat/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index ce7a61f1d912..cf00c0a7e55c 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -17,3 +17,5 @@ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ + +obj-$(CONFIG_QAT_VFIO_PCI) += qat/ diff --git a/drivers/vfio/pci/qat/Kconfig b/drivers/vfio/pci/qat/Kconfig new file mode 100644 index 000000000000..bf52cfa4b595 --- /dev/null +++ b/drivers/vfio/pci/qat/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config QAT_VFIO_PCI + tristate "VFIO support for QAT VF PCI devices" + select VFIO_PCI_CORE + depends on CRYPTO_DEV_QAT_4XXX + help + This provides migration support for Intel(R) QAT Virtual Function + using the VFIO framework. + + To compile this as a module, choose M here: the module + will be called qat_vfio_pci. If you don't know what to do here, + say N. diff --git a/drivers/vfio/pci/qat/Makefile b/drivers/vfio/pci/qat/Makefile new file mode 100644 index 000000000000..5fe5c4ec19d3 --- /dev/null +++ b/drivers/vfio/pci/qat/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QAT_VFIO_PCI) += qat_vfio_pci.o +qat_vfio_pci-y := main.o diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c new file mode 100644 index 000000000000..e36740a282e7 --- /dev/null +++ b/drivers/vfio/pci/qat/main.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2024 Intel Corporation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The migration data of each Intel QAT VF device is encapsulated into a + * 4096 bytes block. The data consists of two parts. + * The first is a pre-configured set of attributes of the VF being migrated, + * which are only set when it is created. This can be migrated during pre-copy + * stage and used for a device compatibility check. + * The second is the VF state. This includes the required MMIO regions and + * the shadow states maintained by the QAT PF driver. 
This part can only be + * saved when the VF is fully quiesced and be migrated during stop-copy stage. + * Both these 2 parts of data are saved in hierarchical structures including + * a preamble section and several raw state sections. + * When the pre-configured part of the migration data is fully retrieved from + * user space, the preamble section are used to validate the correctness of + * the data blocks and check the version compatibility. The raw state sections + * are then used to do a device compatibility check. + * When the device transits from RESUMING state, the VF states are extracted + * from the raw state sections of the VF state part of the migration data and + * then loaded into the device. + */ + +struct qat_vf_migration_file { + struct file *filp; + /* protects migration region context */ + struct mutex lock; + bool disabled; + struct qat_vf_core_device *qat_vdev; + ssize_t filled_size; +}; + +struct qat_vf_core_device { + struct vfio_pci_core_device core_device; + struct qat_mig_dev *mdev; + /* protects migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + struct qat_vf_migration_file *resuming_migf; + struct qat_vf_migration_file *saving_migf; +}; + +static int qat_vf_pci_open_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = + container_of(core_vdev, struct qat_vf_core_device, + core_device.vdev); + struct vfio_pci_core_device *vdev = &qat_vdev->core_device; + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + ret = qat_vfmig_open(qat_vdev->mdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; + } + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void qat_vf_disable_fd(struct qat_vf_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filp->f_pos = 0; + migf->filled_size = 0; + mutex_unlock(&migf->lock); +} + +static void qat_vf_disable_fds(struct qat_vf_core_device *qat_vdev) +{ + if (qat_vdev->resuming_migf) { + qat_vf_disable_fd(qat_vdev->resuming_migf); + fput(qat_vdev->resuming_migf->filp); + qat_vdev->resuming_migf = NULL; + } + + if (qat_vdev->saving_migf) { + qat_vf_disable_fd(qat_vdev->saving_migf); + fput(qat_vdev->saving_migf->filp); + qat_vdev->saving_migf = NULL; + } +} + +static void qat_vf_pci_close_device(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vfmig_close(qat_vdev->mdev); + qat_vf_disable_fds(qat_vdev); + vfio_pci_core_close_device(core_vdev); +} + +static long qat_vf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_vf_core_device *qat_vdev = migf->qat_vdev; + struct qat_mig_dev *mig_dev = qat_vdev->mdev; + struct vfio_precopy_info info; + loff_t *pos = &filp->f_pos; + unsigned long minsz; + int ret = 0; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&qat_vdev->state_mutex); + if (qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + mutex_unlock(&qat_vdev->state_mutex); + return -EINVAL; + } + + mutex_lock(&migf->lock); + if (migf->disabled) { + ret = -ENODEV; + goto out; + } + + 
if (*pos > mig_dev->setup_size) { + ret = -EINVAL; + goto out; + } + + info.dirty_bytes = 0; + info.initial_bytes = mig_dev->setup_size - *pos; + +out: + mutex_unlock(&migf->lock); + mutex_unlock(&qat_vdev->state_mutex); + if (ret) + return ret; + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} + +static ssize_t qat_vf_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev; + ssize_t done = 0; + loff_t *offs; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + mutex_lock(&migf->lock); + if (*offs > migf->filled_size || *offs < 0) { + done = -EINVAL; + goto out_unlock; + } + + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + len = min_t(size_t, migf->filled_size - *offs, len); + if (len) { + ret = copy_to_user(buf, mig_dev->state + *offs, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + done = len; + } + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static int qat_vf_release_file(struct inode *inode, struct file *filp) +{ + struct qat_vf_migration_file *migf = filp->private_data; + + qat_vf_disable_fd(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static const struct file_operations qat_vf_save_fops = { + .owner = THIS_MODULE, + .read = qat_vf_save_read, + .unlocked_ioctl = qat_vf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = qat_vf_release_file, + .llseek = no_llseek, +}; + +static int qat_vf_save_state(struct qat_vf_core_device *qat_vdev, + struct qat_vf_migration_file *migf) +{ + int ret; + + ret = qat_vfmig_save_state(qat_vdev->mdev); + if (ret) + return ret; + migf->filled_size = qat_vdev->mdev->state_size; + + return 0; +} + +static int qat_vf_save_setup(struct qat_vf_core_device *qat_vdev, + struct qat_vf_migration_file *migf) +{ + int ret; + + ret = qat_vfmig_save_setup(qat_vdev->mdev); + if (ret) + return ret; + migf->filled_size = qat_vdev->mdev->setup_size; + + return 0; +} + +/* + * Allocate a file handler for user space and then save the migration data for + * the device being migrated. If this is called in the pre-copy stage, save the + * pre-configured device data. Otherwise, if this is called in the stop-copy + * stage, save the device state. In both cases, update the data size which can + * then be read from user space. 
+ */ +static struct qat_vf_migration_file * +qat_vf_save_device_data(struct qat_vf_core_device *qat_vdev, bool pre_copy) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_save_fops, + migf, O_RDONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + if (pre_copy) + ret = qat_vf_save_setup(qat_vdev, migf); + else + ret = qat_vf_save_state(qat_vdev, migf); + if (ret) { + fput(migf->filp); + return ERR_PTR(ret); + } + + migf->qat_vdev = qat_vdev; + + return migf; +} + +static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct qat_vf_migration_file *migf = filp->private_data; + struct qat_mig_dev *mig_dev = migf->qat_vdev->mdev; + loff_t end, *offs; + ssize_t done = 0; + int ret; + + if (pos) + return -ESPIPE; + offs = &filp->f_pos; + + if (*offs < 0 || + check_add_overflow((loff_t)len, *offs, &end)) + return -EOVERFLOW; + + if (end > mig_dev->state_size) + return -ENOMEM; + + mutex_lock(&migf->lock); + if (migf->disabled) { + done = -ENODEV; + goto out_unlock; + } + + ret = copy_from_user(mig_dev->state + *offs, buf, len); + if (ret) { + done = -EFAULT; + goto out_unlock; + } + *offs += len; + migf->filled_size += len; + + /* + * Load the pre-configured device data first to check if the target + * device is compatible with the source device. + */ + ret = qat_vfmig_load_setup(mig_dev, migf->filled_size); + if (ret && ret != -EAGAIN) { + done = ret; + goto out_unlock; + } + done = len; + +out_unlock: + mutex_unlock(&migf->lock); + return done; +} + +static const struct file_operations qat_vf_resume_fops = { + .owner = THIS_MODULE, + .write = qat_vf_resume_write, + .release = qat_vf_release_file, + .llseek = no_llseek, +}; + +static struct qat_vf_migration_file * +qat_vf_resume_device_data(struct qat_vf_core_device *qat_vdev) +{ + struct qat_vf_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("qat_vf_mig", &qat_vf_resume_fops, migf, O_WRONLY); + ret = PTR_ERR_OR_ZERO(migf->filp); + if (ret) { + kfree(migf); + return ERR_PTR(ret); + } + + migf->qat_vdev = qat_vdev; + migf->filled_size = 0; + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + + return migf; +} + +static int qat_vf_load_device_data(struct qat_vf_core_device *qat_vdev) +{ + return qat_vfmig_load_state(qat_vdev->mdev); +} + +static struct file *qat_vf_pci_step_device_state(struct qat_vf_core_device *qat_vdev, u32 new) +{ + u32 cur = qat_vdev->mig_state; + int ret; + + /* + * As the device is not capable of just stopping P2P DMAs, suspend the + * device completely once any of the P2P states are reached. + * When it is suspended, all its MMIO registers can still be operated + * correctly, jobs submitted through ring are queued while no jobs are + * processed by the device. The MMIO states can be safely migrated to + * the target VF during stop-copy stage and restored correctly in the + * target VF. All queued jobs can be resumed then. 
+ */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + ret = qat_vfmig_suspend(qat_vdev->mdev); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { + qat_vfmig_resume(qat_vdev->mdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_save_device_data(qat_vdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_resume_device_data(qat_vdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->resuming_migf = migf; + return migf->filp; + } + + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct qat_vf_migration_file *migf; + + migf = qat_vf_save_device_data(qat_vdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + qat_vdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct qat_vf_migration_file *migf = qat_vdev->saving_migf; + + if (!migf) + return ERR_PTR(-EINVAL); + ret = qat_vf_save_state(qat_vdev, migf); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + ret = qat_vf_load_device_data(qat_vdev); + if (ret) + return ERR_PTR(ret); + + qat_vf_disable_fds(qat_vdev); + return NULL; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static void qat_vf_reset_done(struct qat_vf_core_device *qat_vdev) +{ + qat_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + qat_vfmig_reset(qat_vdev->mdev); + qat_vf_disable_fds(qat_vdev); +} + +static struct file *qat_vf_pci_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *res = NULL; + int ret; + + mutex_lock(&qat_vdev->state_mutex); + while (new_state != qat_vdev->mig_state) { + ret = vfio_mig_get_next_state(vdev, qat_vdev->mig_state, + new_state, &next_state); + if (ret) { + res = ERR_PTR(ret); + break; + } + res = qat_vf_pci_step_device_state(qat_vdev, next_state); + if (IS_ERR(res)) + break; + qat_vdev->mig_state = next_state; + if (WARN_ON(res && new_state != qat_vdev->mig_state)) { + fput(res); + res = ERR_PTR(-EINVAL); + break; + } + } + mutex_unlock(&qat_vdev->state_mutex); + + return 
res; +} + +static int qat_vf_pci_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *curr_state) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + mutex_lock(&qat_vdev->state_mutex); + *curr_state = qat_vdev->mig_state; + mutex_unlock(&qat_vdev->state_mutex); + + return 0; +} + +static int qat_vf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct qat_vf_core_device *qat_vdev = container_of(vdev, + struct qat_vf_core_device, core_device.vdev); + + mutex_lock(&qat_vdev->state_mutex); + *stop_copy_length = qat_vdev->mdev->state_size; + mutex_unlock(&qat_vdev->state_mutex); + + return 0; +} + +static const struct vfio_migration_ops qat_vf_pci_mig_ops = { + .migration_set_state = qat_vf_pci_set_device_state, + .migration_get_state = qat_vf_pci_get_device_state, + .migration_get_data_size = qat_vf_pci_get_data_size, +}; + +static void qat_vf_pci_release_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + + qat_vfmig_cleanup(qat_vdev->mdev); + qat_vfmig_destroy(qat_vdev->mdev); + mutex_destroy(&qat_vdev->state_mutex); + vfio_pci_core_release_dev(core_vdev); +} + +static int qat_vf_pci_init_dev(struct vfio_device *core_vdev) +{ + struct qat_vf_core_device *qat_vdev = container_of(core_vdev, + struct qat_vf_core_device, core_device.vdev); + struct qat_mig_dev *mdev; + struct pci_dev *parent; + int ret, vf_id; + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + core_vdev->mig_ops = &qat_vf_pci_mig_ops; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + mutex_init(&qat_vdev->state_mutex); + + parent = pci_physfn(qat_vdev->core_device.pdev); + vf_id = pci_iov_vf_id(qat_vdev->core_device.pdev); + if (vf_id < 0) { + ret = -ENODEV; + goto err_rel; + } + + mdev = qat_vfmig_create(parent, vf_id); + if (IS_ERR(mdev)) { + ret = PTR_ERR(mdev); + goto err_rel; + } + + ret = qat_vfmig_init(mdev); + if (ret) + goto err_destroy; + + qat_vdev->mdev = mdev; + + return 0; + +err_destroy: + qat_vfmig_destroy(mdev); +err_rel: + vfio_pci_core_release_dev(core_vdev); + return ret; +} + +static const struct vfio_device_ops qat_vf_pci_ops = { + .name = "qat-vf-vfio-pci", + .init = qat_vf_pci_init_dev, + .release = qat_vf_pci_release_dev, + .open_device = qat_vf_pci_open_device, + .close_device = qat_vf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static struct qat_vf_core_device *qat_vf_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = pci_get_drvdata(pdev); + + return container_of(core_device, struct qat_vf_core_device, core_device); +} + +static void qat_vf_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + if (!qat_vdev->mdev) + return; + + mutex_lock(&qat_vdev->state_mutex); + qat_vf_reset_done(qat_vdev); + mutex_unlock(&qat_vdev->state_mutex); +} + +static int +qat_vf_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = 
&pdev->dev; + struct qat_vf_core_device *qat_vdev; + int ret; + + qat_vdev = vfio_alloc_device(qat_vf_core_device, core_device.vdev, dev, &qat_vf_pci_ops); + if (IS_ERR(qat_vdev)) + return PTR_ERR(qat_vdev); + + pci_set_drvdata(pdev, &qat_vdev->core_device); + ret = vfio_pci_core_register_device(&qat_vdev->core_device); + if (ret) + goto out_put_device; + + return 0; + +out_put_device: + vfio_put_device(&qat_vdev->core_device.vdev); + return ret; +} + +static void qat_vf_vfio_pci_remove(struct pci_dev *pdev) +{ + struct qat_vf_core_device *qat_vdev = qat_vf_drvdata(pdev); + + vfio_pci_core_unregister_device(&qat_vdev->core_device); + vfio_put_device(&qat_vdev->core_device.vdev); +} + +static const struct pci_device_id qat_vf_vfio_pci_table[] = { + /* Intel QAT GEN4 4xxx VF device */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) }, + {} +}; +MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table); + +static const struct pci_error_handlers qat_vf_err_handlers = { + .reset_done = qat_vf_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver qat_vf_vfio_pci_driver = { + .name = "qat_vfio_pci", + .id_table = qat_vf_vfio_pci_table, + .probe = qat_vf_vfio_pci_probe, + .remove = qat_vf_vfio_pci_remove, + .err_handler = &qat_vf_err_handlers, + .driver_managed_dma = true, +}; +module_pci_driver(qat_vf_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Xin Zeng "); +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family"); +MODULE_IMPORT_NS(CRYPTO_QAT); -- cgit From f6944d4a0b87c16bc34ae589169e1ded3d4db08e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 3 May 2024 08:31:36 -0600 Subject: vfio/pci: Collect hot-reset devices to local buffer Lockdep reports the below circular locking dependency issue. The mmap_lock acquisition while holding pci_bus_sem is due to the use of copy_to_user() from within a pci_walk_bus() callback. Building the devices array directly into the user buffer is only for convenience. Instead we can allocate a local buffer for the array, bounded by the number of devices on the bus/slot, fill the device information into this local buffer, then copy it into the user buffer outside the bus walk callback. ====================================================== WARNING: possible circular locking dependency detected 6.9.0-rc5+ #39 Not tainted ------------------------------------------------------ CPU 0/KVM/4113 is trying to acquire lock: ffff99a609ee18a8 (&vdev->vma_lock){+.+.}-{4:4}, at: vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] but task is already holding lock: ffff99a243a052a0 (&mm->mmap_lock){++++}-{4:4}, at: vaddr_get_pfns+0x3f/0x170 [vfio_iommu_type1] which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #3 (&mm->mmap_lock){++++}-{4:4}: __lock_acquire+0x4e4/0xb90 lock_acquire+0xbc/0x2d0 __might_fault+0x5c/0x80 _copy_to_user+0x1e/0x60 vfio_pci_fill_devs+0x9f/0x130 [vfio_pci_core] vfio_pci_walk_wrapper+0x45/0x60 [vfio_pci_core] __pci_walk_bus+0x6b/0xb0 vfio_pci_ioctl_get_pci_hot_reset_info+0x10b/0x1d0 [vfio_pci_core] vfio_pci_core_ioctl+0x1cb/0x400 [vfio_pci_core] vfio_device_fops_unl_ioctl+0x7e/0x140 [vfio] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x8d/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #2 (pci_bus_sem){++++}-{4:4}: __lock_acquire+0x4e4/0xb90 lock_acquire+0xbc/0x2d0 down_read+0x3e/0x160 pci_bridge_wait_for_secondary_bus.part.0+0x33/0x2d0 pci_reset_bus+0xdd/0x160 vfio_pci_dev_set_hot_reset+0x256/0x270 [vfio_pci_core] vfio_pci_ioctl_pci_hot_reset_groups+0x1a3/0x280 [vfio_pci_core] vfio_pci_core_ioctl+0x3b5/0x400 [vfio_pci_core] vfio_device_fops_unl_ioctl+0x7e/0x140 [vfio] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x8d/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #1 (&vdev->memory_lock){+.+.}-{4:4}: __lock_acquire+0x4e4/0xb90 lock_acquire+0xbc/0x2d0 down_write+0x3b/0xc0 vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30 [vfio_pci_core] vfio_basic_config_write+0x281/0x340 [vfio_pci_core] vfio_config_do_rw+0x1fa/0x300 [vfio_pci_core] vfio_pci_config_rw+0x75/0xe50 [vfio_pci_core] vfio_pci_rw+0xea/0x1a0 [vfio_pci_core] vfs_write+0xea/0x520 __x64_sys_pwrite64+0x90/0xc0 do_syscall_64+0x8d/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e -> #0 (&vdev->vma_lock){+.+.}-{4:4}: check_prev_add+0xeb/0xcc0 validate_chain+0x465/0x530 __lock_acquire+0x4e4/0xb90 lock_acquire+0xbc/0x2d0 __mutex_lock+0x97/0xde0 vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] __do_fault+0x31/0x160 do_pte_missing+0x65/0x3b0 __handle_mm_fault+0x303/0x720 handle_mm_fault+0x10f/0x460 fixup_user_fault+0x7f/0x1f0 follow_fault_pfn+0x66/0x1c0 [vfio_iommu_type1] vaddr_get_pfns+0xf2/0x170 [vfio_iommu_type1] vfio_pin_pages_remote+0x348/0x4e0 [vfio_iommu_type1] vfio_pin_map_dma+0xd2/0x330 [vfio_iommu_type1] vfio_dma_do_map+0x2c0/0x440 [vfio_iommu_type1] vfio_iommu_type1_ioctl+0xc5/0x1d0 [vfio_iommu_type1] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x8d/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e other info that might help us debug this: Chain exists of: &vdev->vma_lock --> pci_bus_sem --> &mm->mmap_lock Possible unsafe locking scenario: block dm-0: the capability attribute has been deprecated. CPU0 CPU1 ---- ---- rlock(&mm->mmap_lock); lock(pci_bus_sem); lock(&mm->mmap_lock); lock(&vdev->vma_lock); *** DEADLOCK *** 2 locks held by CPU 0/KVM/4113: #0: ffff99a25f294888 (&iommu->lock#2){+.+.}-{4:4}, at: vfio_dma_do_map+0x60/0x440 [vfio_iommu_type1] #1: ffff99a243a052a0 (&mm->mmap_lock){++++}-{4:4}, at: vaddr_get_pfns+0x3f/0x170 [vfio_iommu_type1] stack backtrace: CPU: 1 PID: 4113 Comm: CPU 0/KVM Not tainted 6.9.0-rc5+ #39 Hardware name: Dell Inc. PowerEdge T640/04WYPY, BIOS 2.15.1 06/16/2022 Call Trace: dump_stack_lvl+0x64/0xa0 check_noncircular+0x131/0x150 check_prev_add+0xeb/0xcc0 ? add_chain_cache+0x10a/0x2f0 ? __lock_acquire+0x4e4/0xb90 validate_chain+0x465/0x530 __lock_acquire+0x4e4/0xb90 lock_acquire+0xbc/0x2d0 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] ? lock_is_held_type+0x9a/0x110 __mutex_lock+0x97/0xde0 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] ? lock_acquire+0xbc/0x2d0 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] ? find_held_lock+0x2b/0x80 ? 
vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core] __do_fault+0x31/0x160 do_pte_missing+0x65/0x3b0 __handle_mm_fault+0x303/0x720 handle_mm_fault+0x10f/0x460 fixup_user_fault+0x7f/0x1f0 follow_fault_pfn+0x66/0x1c0 [vfio_iommu_type1] vaddr_get_pfns+0xf2/0x170 [vfio_iommu_type1] vfio_pin_pages_remote+0x348/0x4e0 [vfio_iommu_type1] vfio_pin_map_dma+0xd2/0x330 [vfio_iommu_type1] vfio_dma_do_map+0x2c0/0x440 [vfio_iommu_type1] vfio_iommu_type1_ioctl+0xc5/0x1d0 [vfio_iommu_type1] __x64_sys_ioctl+0x8a/0xc0 do_syscall_64+0x8d/0x170 ? rcu_core+0x8d/0x250 ? __lock_release+0x5e/0x160 ? rcu_core+0x8d/0x250 ? lock_release+0x5f/0x120 ? sched_clock+0xc/0x30 ? sched_clock_cpu+0xb/0x190 ? irqtime_account_irq+0x40/0xc0 ? __local_bh_enable+0x54/0x60 ? __do_softirq+0x315/0x3ca ? lockdep_hardirqs_on_prepare.part.0+0x97/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f8300d0357b Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 75 68 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007f82ef3fb948 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8300d0357b RDX: 00007f82ef3fb990 RSI: 0000000000003b71 RDI: 0000000000000023 RBP: 00007f82ef3fb9c0 R08: 0000000000000000 R09: 0000561b7e0bcac2 R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000200000000 R14: 0000381800000000 R15: 0000000000000000 Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240503143138.3562116-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 78 +++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d94d61b92c1a..d8c95cc16be8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -778,25 +778,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - struct vfio_pci_dependent_device __user *devices; - struct vfio_pci_dependent_device __user *devices_end; struct vfio_device *vdev; + struct vfio_pci_dependent_device *devices; + int nr_devices; u32 count; u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { - struct vfio_pci_dependent_device info = { - .segment = pci_domain_nr(pdev->bus), - .bus = pdev->bus->number, - .devfn = pdev->devfn, - }; + struct vfio_pci_dependent_device *info; struct vfio_pci_fill_info *fill = data; - fill->count++; - if (fill->devices >= fill->devices_end) - return 0; + /* The topology changed since we counted devices */ + if (fill->count >= fill->nr_devices) + return -EAGAIN; + + info = &fill->devices[fill->count++]; + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); @@ -809,19 +810,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) */ vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); if (!vdev) { - info.devid = VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } else { int id = vfio_iommufd_get_dev_id(vdev, iommufd); if (id > 0) - info.devid = id; + info->devid = id; else if (id == -ENOENT) - info.devid = VFIO_PCI_DEVID_OWNED; + info->devid = VFIO_PCI_DEVID_OWNED; else - info.devid = 
VFIO_PCI_DEVID_NOT_OWNED; + info->devid = VFIO_PCI_DEVID_NOT_OWNED; } /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ - if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) + if (info->devid == VFIO_PCI_DEVID_NOT_OWNED) fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; } else { struct iommu_group *iommu_group; @@ -830,13 +831,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) if (!iommu_group) return -EPERM; /* Cannot reset non-isolated devices */ - info.group_id = iommu_group_id(iommu_group); + info->group_id = iommu_group_id(iommu_group); iommu_group_put(iommu_group); } - if (copy_to_user(fill->devices, &info, sizeof(info))) - return -EFAULT; - fill->devices++; return 0; } @@ -1258,10 +1256,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + struct vfio_pci_dependent_device *devices = NULL; struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = {}; bool slot = false; - int ret = 0; + int ret, count; if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; @@ -1277,9 +1276,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - fill.devices = arg->devices; - fill.devices_end = arg->devices + - (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) { + hdr.count = count; + ret = -ENOSPC; + goto header; + } + + devices = kcalloc(count, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + fill.nr_devices = count; fill.vdev = &vdev->vdev; if (vfio_device_cdev_opened(&vdev->vdev)) @@ -1291,16 +1304,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( &fill, slot); mutex_unlock(&vdev->vdev.dev_set->lock); if (ret) - return ret; + goto out; + + if (copy_to_user(arg->devices, devices, + sizeof(*devices) * fill.count)) { + ret = -EFAULT; + goto out; + } hdr.count = fill.count; hdr.flags = fill.flags; - if (copy_to_user(arg, &hdr, minsz)) - return -EFAULT; - if (fill.count > fill.devices - arg->devices) - return -ENOSPC; - return 0; +header: + if (copy_to_user(arg, &hdr, minsz)) + ret = -EFAULT; +out: + kfree(devices); + return ret; } static int -- cgit From dda057ad8c9c5f1dcd6cc33ada0fa5fceb5acacc Mon Sep 17 00:00:00 2001 From: "foryun.ma" Date: Fri, 10 May 2024 14:37:35 +1400 Subject: vfio: remove an extra semicolon remove an extra semicolon from the example code Signed-off-by: foryun.ma Link: https://lore.kernel.org/r/20240510003735.2766-1-foryun.ma@jaguarmicro.com Signed-off-by: Alex Williamson --- Documentation/driver-api/vfio.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index 633d11c7fa71..2a21a42c9386 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -364,7 +364,7 @@ IOMMUFD IOAS/HWPT to enable userspace DMA:: MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); map.iova = 0; /* 1MB starting at 0x0 from device view */ map.length = 1024 * 1024; - map.ioas_id = alloc_data.out_ioas_id;; + map.ioas_id = alloc_data.out_ioas_id; ioctl(iommufd, IOMMU_IOAS_MAP, &map); -- cgit From cbb325e77fbe62a06184175aa98c9eb98736c3e8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 16 May 2024 11:48:30 -0600 Subject: vfio/pci: Restore zero affected bus reset devices warning MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yi notes relative to commit f6944d4a0b87 ("vfio/pci: Collect hot-reset devices to local buffer") that we previously tested the resulting device count with a WARN_ON, which was removed when we switched to the in-loop user copy in commit b56b7aabcf3c ("vfio/pci: Copy hot-reset device info to userspace in the devices loop"). Finding no devices in the bus/slot would be an unexpected condition, so let's restore the warning and trigger a -ERANGE error here as success with no devices would be an unexpected result to userspace as well. Suggested-by: Yi Liu Reviewed-by: Cédric Le Goater Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20240516174831.2257970-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d8c95cc16be8..80cae87fff36 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1281,6 +1281,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( if (ret) return ret; + if (WARN_ON(!count)) /* Should always be at least one */ + return -ERANGE; + if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) { hdr.count = count; ret = -ENOSPC; -- cgit
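Taken together, these last two commits rework the locking without changing the userspace contract for VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: a bare header probe still reports the required count alongside -ENOSPC, and a second call with a large enough argsz retrieves the dependent device array. A hedged sketch of that two-call pattern follows (the function name and error handling are illustrative, not from the patches):

	#include <errno.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static struct vfio_pci_hot_reset_info *get_hot_reset_info(int device_fd)
	{
		struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };
		struct vfio_pci_hot_reset_info *info;
		size_t sz;

		/* Probe with a bare header: the kernel sets hdr.count and
		 * fails with ENOSPC since no array space was offered. */
		if (!ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr) ||
		    errno != ENOSPC)
			return NULL;

		sz = sizeof(*info) +
		     hdr.count * sizeof(struct vfio_pci_dependent_device);
		info = calloc(1, sz);
		if (!info)
			return NULL;

		/* Retry with argsz covering the dependent device array. */
		info->argsz = sz;
		if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info)) {
			free(info);
			return NULL;
		}

		return info;
	}

Each returned vfio_pci_dependent_device then identifies one device affected by the bus or slot reset — by devid when VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, or by IOMMU group otherwise.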