From bf8d2dd2ed0825a58f31cc510245a1eb46f8a87e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:52 +0200 Subject: iommu/s390: Fix duplicate domain attachments Since commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") we can end up with duplicates in the list of devices attached to a domain. This is inefficient and confusing since only one domain can actually be in control of the IOMMU translations for a device. Fix this by detaching the device from the previous domain, if any, on attach. Add a WARN_ON() in case we still have attached devices on freeing the domain. While here remove the re-attach on failure dance as it was determined to be unlikely to help and may confuse debug and recovery. Fixes: fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev calls") Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-2-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 106 +++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 61 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 3c071782f6f1..c2e5e81d609e 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -79,10 +79,36 @@ static void s390_domain_free(struct iommu_domain *domain) { struct s390_domain *s390_domain = to_s390_domain(domain); + WARN_ON(!list_empty(&s390_domain->devices)); dma_cleanup_tables(s390_domain->dma_table); kfree(s390_domain); } +static void __s390_iommu_detach_device(struct zpci_dev *zdev) +{ + struct s390_domain *s390_domain = zdev->s390_domain; + struct s390_domain_device *domain_device, *tmp; + unsigned long flags; + + if (!s390_domain) + return; + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices, + list) { + if (domain_device->zdev == zdev) { + list_del(&domain_device->list); + kfree(domain_device); + break; + } + } + spin_unlock_irqrestore(&s390_domain->list_lock, flags); + + zpci_unregister_ioat(zdev, 0); + zdev->s390_domain = NULL; + zdev->dma_table = NULL; +} + static int s390_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { @@ -90,7 +116,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, struct zpci_dev *zdev = to_zpci_dev(dev); struct s390_domain_device *domain_device; unsigned long flags; - int cc, rc; + int cc, rc = 0; if (!zdev) return -ENODEV; @@ -99,24 +125,18 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, if (!domain_device) return -ENOMEM; - if (zdev->dma_table && !zdev->s390_domain) { - cc = zpci_dma_exit_device(zdev); - if (cc) { - rc = -EIO; - goto out_free; - } - } - if (zdev->s390_domain) - zpci_unregister_ioat(zdev, 0); + __s390_iommu_detach_device(zdev); + else if (zdev->dma_table) + zpci_dma_exit_device(zdev); - zdev->dma_table = s390_domain->dma_table; cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); + virt_to_phys(s390_domain->dma_table)); if (cc) { rc = -EIO; - goto out_restore; + goto out_free; } + zdev->dma_table = s390_domain->dma_table; spin_lock_irqsave(&s390_domain->list_lock, flags); /* First device defines the DMA range limits */ @@ -127,9 +147,9 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, /* Allow only devices with identical DMA range limits */ } else if (domain->geometry.aperture_start != 
zdev->start_dma || domain->geometry.aperture_end != zdev->end_dma) { - rc = -EINVAL; spin_unlock_irqrestore(&s390_domain->list_lock, flags); - goto out_restore; + rc = -EINVAL; + goto out_unregister; } domain_device->zdev = zdev; zdev->s390_domain = s390_domain; @@ -138,14 +158,9 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return 0; -out_restore: - if (!zdev->s390_domain) { - zpci_dma_init_device(zdev); - } else { - zdev->dma_table = zdev->s390_domain->dma_table; - zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); - } +out_unregister: + zpci_unregister_ioat(zdev, 0); + zdev->dma_table = NULL; out_free: kfree(domain_device); @@ -155,32 +170,12 @@ out_free: static void s390_iommu_detach_device(struct iommu_domain *domain, struct device *dev) { - struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); - struct s390_domain_device *domain_device, *tmp; - unsigned long flags; - int found = 0; - if (!zdev) - return; + WARN_ON(zdev->s390_domain != to_s390_domain(domain)); - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices, - list) { - if (domain_device->zdev == zdev) { - list_del(&domain_device->list); - kfree(domain_device); - found = 1; - break; - } - } - spin_unlock_irqrestore(&s390_domain->list_lock, flags); - - if (found && (zdev->s390_domain == s390_domain)) { - zdev->s390_domain = NULL; - zpci_unregister_ioat(zdev, 0); - zpci_dma_init_device(zdev); - } + __s390_iommu_detach_device(zdev); + zpci_dma_init_device(zdev); } static struct iommu_device *s390_iommu_probe_device(struct device *dev) @@ -198,24 +193,13 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) static void s390_iommu_release_device(struct device *dev) { struct zpci_dev *zdev = to_zpci_dev(dev); - struct iommu_domain *domain; /* - * This is a workaround for a scenario where the IOMMU API common code - * "forgets" to call the detach_dev callback: After binding a device - * to vfio-pci and completing the VFIO_SET_IOMMU ioctl (which triggers - * the attach_dev), removing the device via - * "echo 1 > /sys/bus/pci/devices/.../remove" won't trigger detach_dev, - * only release_device will be called via the BUS_NOTIFY_REMOVED_DEVICE - * notifier. - * - * So let's call detach_dev from here if it hasn't been called before. + * release_device is expected to detach any domain currently attached + * to the device, but keep it attached to other devices in the group. */ - if (zdev && zdev->s390_domain) { - domain = iommu_get_domain_for_dev(dev); - if (domain) - s390_iommu_detach_device(domain, dev); - } + if (zdev) + __s390_iommu_detach_device(zdev); } static int s390_iommu_update_trans(struct s390_domain *s390_domain, -- cgit From 1a3a7d64bbce3179401f4e691522ff992aa1b8a1 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:53 +0200 Subject: iommu/s390: Get rid of s390_domain_device The struct s390_domain_device serves the sole purpose as list entry for the devices list of a struct s390_domain. As it contains no additional information besides a list_head and a pointer to the struct zpci_dev we can simplify things and just thread the device list through struct zpci_dev directly. This removes the need to allocate during domain attach and gets rid of one level of indirection during mapping operations. 
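To make the simplification concrete, here is a minimal before/after sketch distilled from the diff below (declarations abbreviated, not the full kernel structs; the iteration mirrors s390_iommu_update_trans() as changed by this patch):

/* Before: each attach allocates a wrapper whose only job is linking the
 * device into s390_domain->devices, costing an allocation on attach and
 * a pointer chase on every walk of the list.
 */
struct s390_domain_device {
	struct list_head list;	/* links into s390_domain->devices */
	struct zpci_dev *zdev;
};

/* After: the list_head is embedded in struct zpci_dev itself, so a walk
 * of the domain's device list yields the device directly:
 */
list_for_each_entry(zdev, &s390_domain->devices, iommu_list)
	rc = zpci_refresh_trans((u64)zdev->fh << 32,
				start_dma_addr, nr_pages * PAGE_SIZE);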
Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-3-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 1 + drivers/iommu/s390-iommu.c | 37 +++++++------------------------------ 2 files changed, 8 insertions(+), 30 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 108e732d7b14..15f8714ca9b7 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -117,6 +117,7 @@ struct zpci_bus { struct zpci_dev { struct zpci_bus *zbus; struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ + struct list_head iommu_list; struct kref kref; struct hotplug_slot hotplug_slot; diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index c2e5e81d609e..af83ccde16a4 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -29,11 +29,6 @@ struct s390_domain { spinlock_t list_lock; }; -struct s390_domain_device { - struct list_head list; - struct zpci_dev *zdev; -}; - static struct s390_domain *to_s390_domain(struct iommu_domain *dom) { return container_of(dom, struct s390_domain, domain); @@ -87,21 +82,13 @@ static void s390_domain_free(struct iommu_domain *domain) static void __s390_iommu_detach_device(struct zpci_dev *zdev) { struct s390_domain *s390_domain = zdev->s390_domain; - struct s390_domain_device *domain_device, *tmp; unsigned long flags; if (!s390_domain) return; spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry_safe(domain_device, tmp, &s390_domain->devices, - list) { - if (domain_device->zdev == zdev) { - list_del(&domain_device->list); - kfree(domain_device); - break; - } - } + list_del_init(&zdev->iommu_list); spin_unlock_irqrestore(&s390_domain->list_lock, flags); zpci_unregister_ioat(zdev, 0); @@ -114,17 +101,12 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); - struct s390_domain_device *domain_device; unsigned long flags; int cc, rc = 0; if (!zdev) return -ENODEV; - domain_device = kzalloc(sizeof(*domain_device), GFP_KERNEL); - if (!domain_device) - return -ENOMEM; - if (zdev->s390_domain) __s390_iommu_detach_device(zdev); else if (zdev->dma_table) @@ -132,10 +114,8 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, virt_to_phys(s390_domain->dma_table)); - if (cc) { - rc = -EIO; - goto out_free; - } + if (cc) + return -EIO; zdev->dma_table = s390_domain->dma_table; spin_lock_irqsave(&s390_domain->list_lock, flags); @@ -151,9 +131,8 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, rc = -EINVAL; goto out_unregister; } - domain_device->zdev = zdev; zdev->s390_domain = s390_domain; - list_add(&domain_device->list, &s390_domain->devices); + list_add(&zdev->iommu_list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); return 0; @@ -161,8 +140,6 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, out_unregister: zpci_unregister_ioat(zdev, 0); zdev->dma_table = NULL; -out_free: - kfree(domain_device); return rc; } @@ -206,10 +183,10 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, phys_addr_t pa, dma_addr_t dma_addr, size_t size, int flags) { - struct s390_domain_device *domain_device; phys_addr_t page_addr = pa 
& PAGE_MASK; dma_addr_t start_dma_addr = dma_addr; unsigned long irq_flags, nr_pages, i; + struct zpci_dev *zdev; unsigned long *entry; int rc = 0; @@ -234,8 +211,8 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, } spin_lock(&s390_domain->list_lock); - list_for_each_entry(domain_device, &s390_domain->devices, list) { - rc = zpci_refresh_trans((u64) domain_device->zdev->fh << 32, + list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + rc = zpci_refresh_trans((u64)zdev->fh << 32, start_dma_addr, nr_pages * PAGE_SIZE); if (rc) break; -- cgit From cbf7827bc5dcfa4301aaea6f57eba9a94dbee7b1 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:54 +0200 Subject: iommu/s390: Fix potential s390_domain aperture shrinking The s390 IOMMU driver currently sets the IOMMU domain's aperture to match the device specific DMA address range of the device that is first attached. This is not ideal. For one, if the domain has no device attached in the meantime, the aperture could be shrunk, allowing translations outside the aperture to exist in the translation tables. Also, this is a bit of a misuse of the aperture, which really should describe what addresses can be translated and not some device specific limitations. Instead of misusing the aperture like this, we can create reserved ranges for the ranges inaccessible to the attached devices, allowing devices with overlapping ranges to still share an IOMMU domain. This also significantly simplifies s390_iommu_attach_device(), allowing us to move the aperture check to the beginning of the function and removing the need to hold the device list's lock to check the aperture. As we then use the same aperture for all domains and it only depends on the table properties, we can already check zdev->start_dma/end_dma at probe time and turn the check on attach into a WARN_ON(). 
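Concretely, the reserved ranges introduced below carve out whatever part of the fixed aperture the device cannot address. A worked example with hypothetical device limits (SZ_4G/SZ_8G are illustrative values, not taken from the patch; the helper names match the diff that follows):

/* Suppose a device has zdev->start_dma = SZ_4G and
 * zdev->end_dma = SZ_8G - 1, while every domain's aperture is fixed at
 * [0, ZPCI_TABLE_SIZE_RT - 1]. get_resv_regions then reports:
 *
 *   [0, SZ_4G - 1]                      IOMMU_RESV_RESERVED
 *   [SZ_8G, ZPCI_TABLE_SIZE_RT - 1]     IOMMU_RESV_RESERVED
 *
 * so IOVA allocation is confined to [SZ_4G, SZ_8G - 1] without ever
 * shrinking the domain's aperture.
 */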
Suggested-by: Jason Gunthorpe Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-4-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 63 +++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 20 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index af83ccde16a4..9b3adc61005c 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -62,6 +62,9 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type) kfree(s390_domain); return NULL; } + s390_domain->domain.geometry.force_aperture = true; + s390_domain->domain.geometry.aperture_start = 0; + s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1; spin_lock_init(&s390_domain->dma_table_lock); spin_lock_init(&s390_domain->list_lock); @@ -102,11 +105,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); unsigned long flags; - int cc, rc = 0; + int cc; if (!zdev) return -ENODEV; + if (WARN_ON(domain->geometry.aperture_start > zdev->end_dma || + domain->geometry.aperture_end < zdev->start_dma)) + return -EINVAL; + if (zdev->s390_domain) __s390_iommu_detach_device(zdev); else if (zdev->dma_table) @@ -118,30 +125,14 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, return -EIO; zdev->dma_table = s390_domain->dma_table; - spin_lock_irqsave(&s390_domain->list_lock, flags); - /* First device defines the DMA range limits */ - if (list_empty(&s390_domain->devices)) { - domain->geometry.aperture_start = zdev->start_dma; - domain->geometry.aperture_end = zdev->end_dma; - domain->geometry.force_aperture = true; - /* Allow only devices with identical DMA range limits */ - } else if (domain->geometry.aperture_start != zdev->start_dma || - domain->geometry.aperture_end != zdev->end_dma) { - spin_unlock_irqrestore(&s390_domain->list_lock, flags); - rc = -EINVAL; - goto out_unregister; - } + zdev->dma_table = s390_domain->dma_table; zdev->s390_domain = s390_domain; + + spin_lock_irqsave(&s390_domain->list_lock, flags); list_add(&zdev->iommu_list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); return 0; - -out_unregister: - zpci_unregister_ioat(zdev, 0); - zdev->dma_table = NULL; - - return rc; } static void s390_iommu_detach_device(struct iommu_domain *domain, @@ -155,6 +146,30 @@ static void s390_iommu_detach_device(struct iommu_domain *domain, zpci_dma_init_device(zdev); } +static void s390_iommu_get_resv_regions(struct device *dev, + struct list_head *list) +{ + struct zpci_dev *zdev = to_zpci_dev(dev); + struct iommu_resv_region *region; + + if (zdev->start_dma) { + region = iommu_alloc_resv_region(0, zdev->start_dma, 0, + IOMMU_RESV_RESERVED, GFP_KERNEL); + if (!region) + return; + list_add_tail(®ion->list, list); + } + + if (zdev->end_dma < ZPCI_TABLE_SIZE_RT - 1) { + region = iommu_alloc_resv_region(zdev->end_dma + 1, + ZPCI_TABLE_SIZE_RT - zdev->end_dma - 1, + 0, IOMMU_RESV_RESERVED, GFP_KERNEL); + if (!region) + return; + list_add_tail(®ion->list, list); + } +} + static struct iommu_device *s390_iommu_probe_device(struct device *dev) { struct zpci_dev *zdev; @@ -164,6 +179,13 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) zdev = to_zpci_dev(dev); + if (zdev->start_dma > zdev->end_dma || + 
zdev->start_dma > ZPCI_TABLE_SIZE_RT - 1) + return ERR_PTR(-EINVAL); + + if (zdev->end_dma > ZPCI_TABLE_SIZE_RT - 1) + zdev->end_dma = ZPCI_TABLE_SIZE_RT - 1; + return &zdev->iommu_dev; } @@ -342,6 +364,7 @@ static const struct iommu_ops s390_iommu_ops = { .release_device = s390_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = S390_IOMMU_PGSIZES, + .get_resv_regions = s390_iommu_get_resv_regions, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = s390_iommu_attach_device, .detach_dev = s390_iommu_detach_device, -- cgit From a4d996c2c4b55a42b21d0f7026b2bd6f7396f666 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:55 +0200 Subject: iommu/s390: Fix incorrect aperture check The domain->geometry.aperture_end specifies the last valid address; treat it as such when checking if a DMA address is valid. Reviewed-by: Pierre Morel Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-5-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 9b3adc61005c..3e601ca6ee0f 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -213,7 +213,7 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, int rc = 0; if (dma_addr < s390_domain->domain.geometry.aperture_start || - dma_addr + size > s390_domain->domain.geometry.aperture_end) + (dma_addr + size - 1) > s390_domain->domain.geometry.aperture_end) return -EINVAL; nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; -- cgit From b4d8ae0e907b096583491101ddfc5143b7c08918 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:56 +0200 Subject: iommu/s390: Fix incorrect pgsize_bitmap The .pgsize_bitmap property of struct iommu_ops is not a page mask but rather has a bit set for each size of pages the IOMMU supports. As the comment correctly pointed out, at this moment the code only supports 4K pages, so simply use SZ_4K here. Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-6-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 3e601ca6ee0f..104dfbec1037 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -12,13 +12,6 @@ #include #include -/* - * Physically contiguous memory regions can be mapped with 4 KiB alignment, - * we allow all page sizes that are an order of 4KiB (no special large page support so far). 
- */ -#define S390_IOMMU_PGSIZES (~0xFFFUL) - static const struct iommu_ops s390_iommu_ops; struct s390_domain { @@ -363,7 +356,7 @@ static const struct iommu_ops s390_iommu_ops = { .probe_device = s390_iommu_probe_device, .release_device = s390_iommu_release_device, .device_group = generic_device_group, - .pgsize_bitmap = S390_IOMMU_PGSIZES, + .pgsize_bitmap = SZ_4K, .get_resv_regions = s390_iommu_get_resv_regions, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = s390_iommu_attach_device, -- cgit From f3cc4f874efa8d5b10ebd9dc8702cd25b9e536a3 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Tue, 25 Oct 2022 13:56:57 +0200 Subject: iommu/s390: Implement map_pages()/unmap_pages() instead of map()/unmap() While s390-iommu currently implements the map_page()/unmap_page() operations which only map/unmap a single page at a time the internal s390_iommu_update_trans() API already supports mapping/unmapping a range of pages at once. Take advantage of this by implementing the map_pages()/unmap_pages() operations instead thus allowing users of the IOMMU drivers to map multiple pages in a single call followed by a single I/O TLB flush if needed. Reviewed-by: Matthew Rosato Reviewed-by: Jason Gunthorpe Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221025115657.1666860-7-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 16 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 104dfbec1037..7fb512bece9a 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -196,20 +196,15 @@ static void s390_iommu_release_device(struct device *dev) static int s390_iommu_update_trans(struct s390_domain *s390_domain, phys_addr_t pa, dma_addr_t dma_addr, - size_t size, int flags) + unsigned long nr_pages, int flags) { phys_addr_t page_addr = pa & PAGE_MASK; dma_addr_t start_dma_addr = dma_addr; - unsigned long irq_flags, nr_pages, i; + unsigned long irq_flags, i; struct zpci_dev *zdev; unsigned long *entry; int rc = 0; - if (dma_addr < s390_domain->domain.geometry.aperture_start || - (dma_addr + size - 1) > s390_domain->domain.geometry.aperture_end) - return -EINVAL; - - nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; if (!nr_pages) return 0; @@ -252,11 +247,24 @@ undo_cpu_trans: return rc; } -static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int s390_iommu_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, + int prot, gfp_t gfp, size_t *mapped) { struct s390_domain *s390_domain = to_s390_domain(domain); int flags = ZPCI_PTE_VALID, rc = 0; + size_t size = pgcount << __ffs(pgsize); + + if (pgsize != SZ_4K) + return -EINVAL; + + if (iova < s390_domain->domain.geometry.aperture_start || + (iova + size - 1) > s390_domain->domain.geometry.aperture_end) + return -EINVAL; + + if (!IS_ALIGNED(iova | paddr, pgsize)) + return -EINVAL; if (!(prot & IOMMU_READ)) return -EINVAL; @@ -265,7 +273,9 @@ static int s390_iommu_map(struct iommu_domain *domain, unsigned long iova, flags |= ZPCI_TABLE_PROTECTED; rc = s390_iommu_update_trans(s390_domain, paddr, iova, - size, flags); + pgcount, flags); + if (!rc) + *mapped = size; return rc; } @@ -301,21 +311,27 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, return phys; } 
-static size_t s390_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *gather) +static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, + unsigned long iova, + size_t pgsize, size_t pgcount, + struct iommu_iotlb_gather *gather) { struct s390_domain *s390_domain = to_s390_domain(domain); + size_t size = pgcount << __ffs(pgsize); int flags = ZPCI_PTE_INVALID; phys_addr_t paddr; int rc; + if (WARN_ON(iova < s390_domain->domain.geometry.aperture_start || + (iova + size - 1) > s390_domain->domain.geometry.aperture_end)) + return 0; + paddr = s390_iommu_iova_to_phys(domain, iova); if (!paddr) return 0; rc = s390_iommu_update_trans(s390_domain, paddr, iova, - size, flags); + pgcount, flags); if (rc) return 0; @@ -361,8 +377,8 @@ static const struct iommu_ops s390_iommu_ops = { .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = s390_iommu_attach_device, .detach_dev = s390_iommu_detach_device, - .map = s390_iommu_map, - .unmap = s390_iommu_unmap, + .map_pages = s390_iommu_map_pages, + .unmap_pages = s390_iommu_unmap_pages, .iova_to_phys = s390_iommu_iova_to_phys, .free = s390_domain_free, } -- cgit From 59bbf596791b89c7f88fdcac29dfc39c1221d25d Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:28:59 +0100 Subject: iommu/s390: Make attach succeed even if the device is in error state If a zPCI device is in the error state while switching IOMMU domains zpci_register_ioat() will fail and we would end up with the device not attached to any domain. In this state since zdev->dma_table == NULL a reset via zpci_hot_reset_device() would wrongfully re-initialize the device for DMA API usage using zpci_dma_init_device(). As automatic recovery is currently disabled while attached to an IOMMU domain this only affects slot resets triggered through other means but will affect automatic recovery once we switch to using dma-iommu. Additionally with that switch common code expects attaching to the default domain to always work so zpci_register_ioat() should only fail if there is no chance to recover anyway, e.g. if the device has been unplugged. Improve the robustness of attach by specifically looking at the status returned by zpci_mod_fc() to determine if the device is unavailable and in this case simply ignore the error. Once the device is reset zpci_hot_reset_device() will then correctly set the domain's DMA translation tables. 
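The resulting attach pattern, sketched here for clarity (a condensed restatement of the s390_iommu_attach_device() hunk in the diff below, not new API):

	u8 status;
	int cc;

	cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma,
				virt_to_phys(s390_domain->dma_table), &status);
	/*
	 * cc != 0 with status == ZPCI_PCI_ST_FUNC_NOT_AVAIL means the
	 * function is unavailable (unplugged or in error recovery). Attach
	 * is allowed to succeed; zpci_hot_reset_device() re-registers the
	 * domain's DMA translation tables once the device comes back.
	 */
	if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL)
		return -EIO;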
Signed-off-by: Niklas Schnelle Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221109142903.4080275-2-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 2 +- arch/s390/kvm/pci.c | 6 ++++-- arch/s390/pci/pci.c | 11 ++++++----- arch/s390/pci/pci_dma.c | 3 ++- drivers/iommu/s390-iommu.c | 9 +++++++-- 5 files changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 15f8714ca9b7..07361e2fd8c5 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -221,7 +221,7 @@ void zpci_device_reserved(struct zpci_dev *zdev); bool zpci_is_device_configured(struct zpci_dev *zdev); int zpci_hot_reset_device(struct zpci_dev *zdev); -int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); +int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64, u8 *); int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); void zpci_update_fh(struct zpci_dev *zdev, u32 fh); diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index c50c1645c0ae..03964c0e1fdf 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -434,6 +434,7 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) { struct zpci_dev *zdev = opaque; + u8 status; int rc; if (!zdev) @@ -486,7 +487,7 @@ static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) /* Re-register the IOMMU that was already created */ rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); + virt_to_phys(zdev->dma_table), &status); if (rc) goto clear_gisa; @@ -516,6 +517,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) { struct zpci_dev *zdev = opaque; struct kvm *kvm; + u8 status; if (!zdev) return; @@ -554,7 +556,7 @@ static void kvm_s390_pci_unregister_kvm(void *opaque) /* Re-register the IOMMU that was already created */ zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); + virt_to_phys(zdev->dma_table), &status); out: spin_lock(&kvm->arch.kzdev_list_lock); diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 73cdc5539384..a703dcd94a68 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -116,20 +116,20 @@ EXPORT_SYMBOL_GPL(pci_proc_domain); /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, - u64 base, u64 limit, u64 iota) + u64 base, u64 limit, u64 iota, u8 *status) { u64 req = ZPCI_CREATE_REQ(zdev->fh, dmaas, ZPCI_MOD_FC_REG_IOAT); struct zpci_fib fib = {0}; - u8 cc, status; + u8 cc; WARN_ON_ONCE(iota & 0x3fff); fib.pba = base; fib.pal = limit; fib.iota = iota | ZPCI_IOTA_RTTO_FLAG; fib.gd = zdev->gisa; - cc = zpci_mod_fc(req, &fib, &status); + cc = zpci_mod_fc(req, &fib, status); if (cc) - zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, status); + zpci_dbg(3, "reg ioat fid:%x, cc:%d, status:%d\n", zdev->fid, cc, *status); return cc; } EXPORT_SYMBOL_GPL(zpci_register_ioat); @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(zpci_disable_device); */ int zpci_hot_reset_device(struct zpci_dev *zdev) { + u8 status; int rc; zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); @@ -787,7 +788,7 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) if (zdev->dma_table) rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table)); + virt_to_phys(zdev->dma_table), 
&status); else rc = zpci_dma_init_device(zdev); if (rc) { diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 227cf0a62800..dee825ee7305 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -547,6 +547,7 @@ static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int zpci_dma_init_device(struct zpci_dev *zdev) { + u8 status; int rc; /* @@ -598,7 +599,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) } if (zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(zdev->dma_table))) { + virt_to_phys(zdev->dma_table), &status)) { rc = -EIO; goto free_bitmap; } diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 7fb512bece9a..e2c886bc4376 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -98,6 +98,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); unsigned long flags; + u8 status; int cc; if (!zdev) @@ -113,8 +114,12 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, zpci_dma_exit_device(zdev); cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, - virt_to_phys(s390_domain->dma_table)); - if (cc) + virt_to_phys(s390_domain->dma_table), &status); + /* + * If the device is undergoing error recovery the reset code + * will re-establish the new domain. + */ + if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) return -EIO; zdev->dma_table = s390_domain->dma_table; -- cgit From c228f5a043370ef02867e4f0aab1bdc8422500e6 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:00 +0100 Subject: iommu/s390: Add I/O TLB ops Currently s390-iommu does an I/O TLB flush (RPCIT) for every update of the I/O translation table explicitly. For one, this is wasteful since RPCIT can be skipped after a mapping operation if zdev->tlb_refresh is unset. Moreover, we can do a single RPCIT for a range of pages, including when doing lazy unmapping. Thankfully both of these optimizations can be achieved by implementing the IOMMU operations common code provides for the different types of I/O TLB flushes: * flush_iotlb_all: Flushes the I/O TLB for the entire IOVA space * iotlb_sync: Flushes the I/O TLB for a range of pages that can be gathered up, for example to implement lazy unmapping. 
* iotlb_sync_map: Flushes the I/O TLB after a mapping operation Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-3-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 67 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 11 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index e2c886bc4376..9771bce86e94 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -199,14 +199,63 @@ static void s390_iommu_release_device(struct device *dev) __s390_iommu_detach_device(zdev); } +static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev; + unsigned long flags; + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, + zdev->end_dma - zdev->start_dma + 1); + } + spin_unlock_irqrestore(&s390_domain->list_lock, flags); +} + +static void s390_iommu_iotlb_sync(struct iommu_domain *domain, + struct iommu_iotlb_gather *gather) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + size_t size = gather->end - gather->start + 1; + struct zpci_dev *zdev; + unsigned long flags; + + /* If gather was never added to there is nothing to flush */ + if (!gather->end) + return; + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + zpci_refresh_trans((u64)zdev->fh << 32, gather->start, + size); + } + spin_unlock_irqrestore(&s390_domain->list_lock, flags); +} + +static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, + unsigned long iova, size_t size) +{ + struct s390_domain *s390_domain = to_s390_domain(domain); + struct zpci_dev *zdev; + unsigned long flags; + + spin_lock_irqsave(&s390_domain->list_lock, flags); + list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + if (!zdev->tlb_refresh) + continue; + zpci_refresh_trans((u64)zdev->fh << 32, + iova, size); + } + spin_unlock_irqrestore(&s390_domain->list_lock, flags); +} + static int s390_iommu_update_trans(struct s390_domain *s390_domain, phys_addr_t pa, dma_addr_t dma_addr, unsigned long nr_pages, int flags) { phys_addr_t page_addr = pa & PAGE_MASK; - dma_addr_t start_dma_addr = dma_addr; unsigned long irq_flags, i; - struct zpci_dev *zdev; unsigned long *entry; int rc = 0; @@ -225,15 +274,6 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, dma_addr += PAGE_SIZE; } - spin_lock(&s390_domain->list_lock); - list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { - rc = zpci_refresh_trans((u64)zdev->fh << 32, - start_dma_addr, nr_pages * PAGE_SIZE); - if (rc) - break; - } - spin_unlock(&s390_domain->list_lock); - undo_cpu_trans: if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { flags = ZPCI_PTE_INVALID; @@ -340,6 +380,8 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, if (rc) return 0; + iommu_iotlb_gather_add_range(gather, iova, size); + return size; } @@ -384,6 +426,9 @@ static const struct iommu_ops s390_iommu_ops = { .detach_dev = s390_iommu_detach_device, .map_pages = s390_iommu_map_pages, .unmap_pages = s390_iommu_unmap_pages, + .flush_iotlb_all = s390_iommu_flush_iotlb_all, + .iotlb_sync = s390_iommu_iotlb_sync, + .iotlb_sync_map = s390_iommu_iotlb_sync_map, .iova_to_phys = 
s390_iommu_iova_to_phys, .free = s390_domain_free, } -- cgit From 2ba8336dab5fb81452aea9c21dfc870050a017f3 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:01 +0100 Subject: iommu/s390: Use RCU to allow concurrent domain_list iteration The s390_domain->devices list is only added to when new devices are attached, but is iterated through in read-only fashion for every mapping operation as well as for I/O TLB flushes, and thus in performance critical code, causing contention on the s390_domain->list_lock. Fortunately such a read-mostly linked list is a standard use case for RCU. This change closely follows the example for RCU protected lists given in Documentation/RCU/listRCU.rst. Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-4-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 1 + arch/s390/pci/pci.c | 2 +- drivers/iommu/s390-iommu.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 18 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 07361e2fd8c5..e4c3e4e04d30 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -119,6 +119,7 @@ struct zpci_dev { struct list_head entry; /* list of all zpci_devices, needed for hotplug, etc. */ struct list_head iommu_list; struct kref kref; + struct rcu_head rcu; struct hotplug_slot hotplug_slot; enum zpci_state state; diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index a703dcd94a68..ef38b1514c77 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -996,7 +996,7 @@ void zpci_release_device(struct kref *kref) break; } zpci_dbg(3, "rem fid:%x\n", zdev->fid); - kfree(zdev); + kfree_rcu(zdev, rcu); } int zpci_report_error(struct pci_dev *pdev, diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 9771bce86e94..cf5dcbcea4e0 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include static const struct iommu_ops s390_iommu_ops; @@ -20,6 +22,7 @@ struct s390_domain { unsigned long *dma_table; spinlock_t dma_table_lock; spinlock_t list_lock; + struct rcu_head rcu; }; static struct s390_domain *to_s390_domain(struct iommu_domain *dom) @@ -61,18 +64,28 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type) spin_lock_init(&s390_domain->dma_table_lock); spin_lock_init(&s390_domain->list_lock); - INIT_LIST_HEAD(&s390_domain->devices); + INIT_LIST_HEAD_RCU(&s390_domain->devices); return &s390_domain->domain; } +static void s390_iommu_rcu_free_domain(struct rcu_head *head) +{ + struct s390_domain *s390_domain = container_of(head, struct s390_domain, rcu); + + dma_cleanup_tables(s390_domain->dma_table); + kfree(s390_domain); +} + static void s390_domain_free(struct iommu_domain *domain) { struct s390_domain *s390_domain = to_s390_domain(domain); + rcu_read_lock(); WARN_ON(!list_empty(&s390_domain->devices)); - dma_cleanup_tables(s390_domain->dma_table); - kfree(s390_domain); + rcu_read_unlock(); + + call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); } static void __s390_iommu_detach_device(struct zpci_dev *zdev) @@ -84,7 +97,7 @@ static void __s390_iommu_detach_device(struct zpci_dev *zdev) return; spin_lock_irqsave(&s390_domain->list_lock, flags); - list_del_init(&zdev->iommu_list); + list_del_rcu(&zdev->iommu_list); spin_unlock_irqrestore(&s390_domain->list_lock, flags);
zpci_unregister_ioat(zdev, 0); @@ -127,7 +140,7 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, zdev->s390_domain = s390_domain; spin_lock_irqsave(&s390_domain->list_lock, flags); - list_add(&zdev->iommu_list, &s390_domain->devices); + list_add_rcu(&zdev->iommu_list, &s390_domain->devices); spin_unlock_irqrestore(&s390_domain->list_lock, flags); return 0; @@ -203,14 +216,13 @@ static void s390_iommu_flush_iotlb_all(struct iommu_domain *domain) { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev; - unsigned long flags; - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { zpci_refresh_trans((u64)zdev->fh << 32, zdev->start_dma, zdev->end_dma - zdev->start_dma + 1); } - spin_unlock_irqrestore(&s390_domain->list_lock, flags); + rcu_read_unlock(); } static void s390_iommu_iotlb_sync(struct iommu_domain *domain, @@ -219,18 +231,17 @@ static void s390_iommu_iotlb_sync(struct iommu_domain *domain, struct s390_domain *s390_domain = to_s390_domain(domain); size_t size = gather->end - gather->start + 1; struct zpci_dev *zdev; - unsigned long flags; /* If gather was never added to there is nothing to flush */ if (!gather->end) return; - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { zpci_refresh_trans((u64)zdev->fh << 32, gather->start, size); } - spin_unlock_irqrestore(&s390_domain->list_lock, flags); + rcu_read_unlock(); } static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, @@ -238,16 +249,15 @@ static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, { struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev; - unsigned long flags; - spin_lock_irqsave(&s390_domain->list_lock, flags); - list_for_each_entry(zdev, &s390_domain->devices, iommu_list) { + rcu_read_lock(); + list_for_each_entry_rcu(zdev, &s390_domain->devices, iommu_list) { if (!zdev->tlb_refresh) continue; zpci_refresh_trans((u64)zdev->fh << 32, iova, size); } - spin_unlock_irqrestore(&s390_domain->list_lock, flags); + rcu_read_unlock(); } static int s390_iommu_update_trans(struct s390_domain *s390_domain, -- cgit From 08955af0600303455f57fe2f2a26f24f9b496b49 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:02 +0100 Subject: iommu/s390: Optimize IOMMU table walking When invalidating existing table entries for unmap there is no need to know the physical address beforehand, so don't do an extra walk of the IOMMU table to get it. Also, when invalidating entries, not finding an entry indicates an invalid unmap rather than a lack of memory, so we don't need to undo updates in this case. Implement this by splitting s390_iommu_update_trans() into a variant for validating and one for invalidating translations. 
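The effect on the unmap path can be seen side by side (condensed from the diff below):

/* Before: an extra walk of the I/O translation table just to learn the
 * physical address, which invalidation does not actually need:
 */
paddr = s390_iommu_iova_to_phys(domain, iova);
rc = s390_iommu_update_trans(s390_domain, paddr, iova,
			     pgcount, ZPCI_PTE_INVALID);

/* After: invalidation walks the table once, keyed by the IOVA alone: */
rc = s390_iommu_invalidate_trans(s390_domain, iova, pgcount);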
Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-5-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- drivers/iommu/s390-iommu.c | 69 +++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 26 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index cf5dcbcea4e0..2b9a3e3bc606 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -260,14 +260,14 @@ static void s390_iommu_iotlb_sync_map(struct iommu_domain *domain, rcu_read_unlock(); } -static int s390_iommu_update_trans(struct s390_domain *s390_domain, - phys_addr_t pa, dma_addr_t dma_addr, - unsigned long nr_pages, int flags) +static int s390_iommu_validate_trans(struct s390_domain *s390_domain, + phys_addr_t pa, dma_addr_t dma_addr, + unsigned long nr_pages, int flags) { phys_addr_t page_addr = pa & PAGE_MASK; unsigned long irq_flags, i; unsigned long *entry; - int rc = 0; + int rc; if (!nr_pages) return 0; @@ -275,7 +275,7 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags); for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr); - if (!entry) { + if (unlikely(!entry)) { rc = -ENOMEM; goto undo_cpu_trans; } @@ -283,19 +283,43 @@ static int s390_iommu_update_trans(struct s390_domain *s390_domain, page_addr += PAGE_SIZE; dma_addr += PAGE_SIZE; } + spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags); + + return 0; undo_cpu_trans: - if (rc && ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID)) { - flags = ZPCI_PTE_INVALID; - while (i-- > 0) { - page_addr -= PAGE_SIZE; - dma_addr -= PAGE_SIZE; - entry = dma_walk_cpu_trans(s390_domain->dma_table, - dma_addr); - if (!entry) - break; - dma_update_cpu_trans(entry, page_addr, flags); + while (i-- > 0) { + dma_addr -= PAGE_SIZE; + entry = dma_walk_cpu_trans(s390_domain->dma_table, + dma_addr); + if (!entry) + break; + dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID); + } + spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags); + + return rc; +} + +static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain, + dma_addr_t dma_addr, unsigned long nr_pages) +{ + unsigned long irq_flags, i; + unsigned long *entry; + int rc = 0; + + if (!nr_pages) + return 0; + + spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags); + for (i = 0; i < nr_pages; i++) { + entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr); + if (unlikely(!entry)) { + rc = -EINVAL; + break; } + dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID); + dma_addr += PAGE_SIZE; } spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags); @@ -308,8 +332,8 @@ static int s390_iommu_map_pages(struct iommu_domain *domain, int prot, gfp_t gfp, size_t *mapped) { struct s390_domain *s390_domain = to_s390_domain(domain); - int flags = ZPCI_PTE_VALID, rc = 0; size_t size = pgcount << __ffs(pgsize); + int flags = ZPCI_PTE_VALID, rc = 0; if (pgsize != SZ_4K) return -EINVAL; @@ -327,8 +351,8 @@ static int s390_iommu_map_pages(struct iommu_domain *domain, if (!(prot & IOMMU_WRITE)) flags |= ZPCI_TABLE_PROTECTED; - rc = s390_iommu_update_trans(s390_domain, paddr, iova, - pgcount, flags); + rc = s390_iommu_validate_trans(s390_domain, paddr, iova, + pgcount, flags); if (!rc) *mapped = size; @@ -373,20 +397,13 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, { struct s390_domain *s390_domain = to_s390_domain(domain); size_t 
size = pgcount << __ffs(pgsize); - int flags = ZPCI_PTE_INVALID; - phys_addr_t paddr; int rc; if (WARN_ON(iova < s390_domain->domain.geometry.aperture_start || (iova + size - 1) > s390_domain->domain.geometry.aperture_end)) return 0; - paddr = s390_iommu_iova_to_phys(domain, iova); - if (!paddr) - return 0; - - rc = s390_iommu_update_trans(s390_domain, paddr, iova, - pgcount, flags); + rc = s390_iommu_invalidate_trans(s390_domain, iova, pgcount); if (rc) return 0; -- cgit From 21c1f9021f0e7d28c3edfcc70e1ca1926ea3774e Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Wed, 9 Nov 2022 15:29:03 +0100 Subject: s390/pci: use lock-free I/O translation updates I/O translation tables on s390 use 8 byte page table entries and tables which are allocated lazily but only freed when the entire I/O translation table is torn down. Also each IOVA can at any time only translate to one physical address. Furthermore, I/O table accesses by the IOMMU hardware are cache coherent. With a bit of care we can thus use atomic updates to manipulate the translation table without having to use a global lock at all. This is done analogously to the existing I/O translation table handling code used on Intel and AMD x86 systems. Signed-off-by: Niklas Schnelle Link: https://lore.kernel.org/r/20221109142903.4080275-6-schnelle@linux.ibm.com Signed-off-by: Joerg Roedel --- arch/s390/include/asm/pci.h | 1 - arch/s390/pci/pci_dma.c | 74 +++++++++++++++++++++++++++------------------ drivers/iommu/s390-iommu.c | 37 ++++++++--------------- 3 files changed, 58 insertions(+), 54 deletions(-) (limited to 'drivers/iommu/s390-iommu.c') diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index e4c3e4e04d30..b248694e0024 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -157,7 +157,6 @@ struct zpci_dev { /* DMA stuff */ unsigned long *dma_table; - spinlock_t dma_table_lock; int tlb_refresh; spinlock_t iommu_bitmap_lock; diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index dee825ee7305..ea478d11fbd1 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -63,37 +63,55 @@ static void dma_free_page_table(void *table) kmem_cache_free(dma_page_table_cache, table); } -static unsigned long *dma_get_seg_table_origin(unsigned long *entry) +static unsigned long *dma_get_seg_table_origin(unsigned long *rtep) { + unsigned long old_rte, rte; unsigned long *sto; - if (reg_entry_isvalid(*entry)) - sto = get_rt_sto(*entry); - else { + rte = READ_ONCE(*rtep); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + } else { sto = dma_alloc_cpu_table(); if (!sto) return NULL; - set_rt_sto(entry, virt_to_phys(sto)); - validate_rt_entry(entry); - entry_clr_protected(entry); + set_rt_sto(&rte, virt_to_phys(sto)); + validate_rt_entry(&rte); + entry_clr_protected(&rte); + + old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte); + if (old_rte != ZPCI_TABLE_INVALID) { + /* Someone else was faster, use theirs */ + dma_free_cpu_table(sto); + sto = get_rt_sto(old_rte); + } } return sto; } -static unsigned long *dma_get_page_table_origin(unsigned long *entry) +static unsigned long *dma_get_page_table_origin(unsigned long *step) { + unsigned long old_ste, ste; unsigned long *pto; - if (reg_entry_isvalid(*entry)) - pto = get_st_pto(*entry); - else { + ste = READ_ONCE(*step); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + } else { pto = dma_alloc_page_table(); if (!pto) return NULL; 
set_st_pto(&ste, virt_to_phys(pto)); + validate_st_entry(&ste); + entry_clr_protected(&ste); + + old_ste = cmpxchg(step, ZPCI_TABLE_INVALID, ste); + if (old_ste != ZPCI_TABLE_INVALID) { + /* Someone else was faster, use theirs */ + dma_free_page_table(pto); + pto = get_st_pto(old_ste); + } } return pto; } @@ -117,19 +135,24 @@ unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr) return &pto[px]; } -void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags) +void dma_update_cpu_trans(unsigned long *ptep, phys_addr_t page_addr, int flags) { + unsigned long pte; + + pte = READ_ONCE(*ptep); if (flags & ZPCI_PTE_INVALID) { - invalidate_pt_entry(entry); + invalidate_pt_entry(&pte); } else { - set_pt_pfaa(entry, page_addr); - validate_pt_entry(entry); + set_pt_pfaa(&pte, page_addr); + validate_pt_entry(&pte); } if (flags & ZPCI_TABLE_PROTECTED) - entry_set_protected(entry); + entry_set_protected(&pte); else - entry_clr_protected(entry); + entry_clr_protected(&pte); + + xchg(ptep, pte); } static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, @@ -137,18 +160,14 @@ static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa, { unsigned int nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; phys_addr_t page_addr = (pa & PAGE_MASK); - unsigned long irq_flags; unsigned long *entry; int i, rc = 0; if (!nr_pages) return -EINVAL; - spin_lock_irqsave(&zdev->dma_table_lock, irq_flags); - if (!zdev->dma_table) { - rc = -EINVAL; - goto out_unlock; - } + if (!zdev->dma_table) + return -EINVAL; for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr); @@ -173,8 +192,6 @@ undo_cpu_trans: dma_update_cpu_trans(entry, page_addr, flags); } } -out_unlock: - spin_unlock_irqrestore(&zdev->dma_table_lock, irq_flags); return rc; } @@ -558,7 +575,6 @@ int zpci_dma_init_device(struct zpci_dev *zdev) WARN_ON(zdev->s390_domain); spin_lock_init(&zdev->iommu_bitmap_lock); - spin_lock_init(&zdev->dma_table_lock); zdev->dma_table = dma_alloc_cpu_table(); if (!zdev->dma_table) { diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index 2b9a3e3bc606..ed33c6cce083 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -20,7 +20,6 @@ struct s390_domain { struct iommu_domain domain; struct list_head devices; unsigned long *dma_table; - spinlock_t dma_table_lock; spinlock_t list_lock; struct rcu_head rcu; }; @@ -62,7 +61,6 @@ static struct iommu_domain *s390_domain_alloc(unsigned domain_type) s390_domain->domain.geometry.aperture_start = 0; s390_domain->domain.geometry.aperture_end = ZPCI_TABLE_SIZE_RT - 1; - spin_lock_init(&s390_domain->dma_table_lock); spin_lock_init(&s390_domain->list_lock); INIT_LIST_HEAD_RCU(&s390_domain->devices); @@ -265,14 +263,10 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain, unsigned long nr_pages, int flags) { phys_addr_t page_addr = pa & PAGE_MASK; - unsigned long irq_flags, i; unsigned long *entry; + unsigned long i; int rc; - if (!nr_pages) - return 0; - - spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags); for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr); if (unlikely(!entry)) { @@ -283,7 +277,6 @@ static int s390_iommu_validate_trans(struct s390_domain *s390_domain, page_addr += PAGE_SIZE; dma_addr += PAGE_SIZE; } - 
spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags); return rc; } @@ -304,14 +296,10 @@ undo_cpu_trans: static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain, dma_addr_t dma_addr, unsigned long nr_pages) { - unsigned long irq_flags, i; unsigned long *entry; + unsigned long i; int rc = 0; - if (!nr_pages) - return 0; - - spin_lock_irqsave(&s390_domain->dma_table_lock, irq_flags); for (i = 0; i < nr_pages; i++) { entry = dma_walk_cpu_trans(s390_domain->dma_table, dma_addr); if (unlikely(!entry)) { @@ -321,7 +309,6 @@ static int s390_iommu_invalidate_trans(struct s390_domain *s390_domain, dma_update_cpu_trans(entry, 0, ZPCI_PTE_INVALID); dma_addr += PAGE_SIZE; } - spin_unlock_irqrestore(&s390_domain->dma_table_lock, irq_flags); return rc; } @@ -363,7 +350,8 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { struct s390_domain *s390_domain = to_s390_domain(domain); - unsigned long *sto, *pto, *rto, flags; + unsigned long *rto, *sto, *pto; + unsigned long ste, pte, rte; unsigned int rtx, sx, px; phys_addr_t phys = 0; @@ -376,16 +364,17 @@ static phys_addr_t s390_iommu_iova_to_phys(struct iommu_domain *domain, px = calc_px(iova); rto = s390_domain->dma_table; - spin_lock_irqsave(&s390_domain->dma_table_lock, flags); - if (rto && reg_entry_isvalid(rto[rtx])) { - sto = get_rt_sto(rto[rtx]); - if (sto && reg_entry_isvalid(sto[sx])) { - pto = get_st_pto(sto[sx]); - if (pto && pt_entry_isvalid(pto[px])) - phys = pto[px] & ZPCI_PTE_ADDR_MASK; + rte = READ_ONCE(rto[rtx]); + if (reg_entry_isvalid(rte)) { + sto = get_rt_sto(rte); + ste = READ_ONCE(sto[sx]); + if (reg_entry_isvalid(ste)) { + pto = get_st_pto(ste); + pte = READ_ONCE(pto[px]); + if (pt_entry_isvalid(pte)) + phys = pte & ZPCI_PTE_ADDR_MASK; } } - spin_unlock_irqrestore(&s390_domain->dma_table_lock, flags); return phys; } -- cgit
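Taken together, the lock-free update protocol boils down to the following pattern, shown here as a condensed restatement of dma_get_seg_table_origin() from the diff above (the function name is shortened for illustration; all helpers are the ones the patch uses):

static unsigned long *get_seg_table(unsigned long *rtep)
{
	unsigned long old_rte, rte;
	unsigned long *sto;

	rte = READ_ONCE(*rtep);			/* single atomic snapshot */
	if (reg_entry_isvalid(rte))
		return get_rt_sto(rte);		/* fast path, no locking */

	sto = dma_alloc_cpu_table();		/* allocate speculatively */
	if (!sto)
		return NULL;
	set_rt_sto(&rte, virt_to_phys(sto));	/* build the entry privately */
	validate_rt_entry(&rte);
	entry_clr_protected(&rte);

	/* Publish: only one CPU can swing the entry INVALID -> valid. */
	old_rte = cmpxchg(rtep, ZPCI_TABLE_INVALID, rte);
	if (old_rte != ZPCI_TABLE_INVALID) {
		/* Someone else was faster, use theirs */
		dma_free_cpu_table(sto);
		sto = get_rt_sto(old_rte);
	}
	return sto;
}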