diff options
Diffstat (limited to 'drivers/iommu')
57 files changed, 6350 insertions, 1938 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index b3aa1f5d5321..47c46e4b739e 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -195,6 +195,7 @@ config MSM_IOMMU source "drivers/iommu/amd/Kconfig" source "drivers/iommu/intel/Kconfig" source "drivers/iommu/iommufd/Kconfig" +source "drivers/iommu/riscv/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" @@ -415,6 +416,15 @@ config ARM_SMMU_V3_SVA Say Y here if your system supports SVA extensions such as PCIe PASID and PRI. +config ARM_SMMU_V3_IOMMUFD + bool "Enable IOMMUFD features for ARM SMMUv3 (EXPERIMENTAL)" + depends on IOMMUFD + help + Support for IOMMUFD features intended to support virtual machines + with accelerated virtual IOMMUs. + + Say Y here if you are doing development and testing on this feature. + config ARM_SMMU_V3_KUNIT_TEST tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 542760d963ec..5e5a83c6c2aa 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += amd/ intel/ arm/ iommufd/ +obj-y += amd/ intel/ arm/ iommufd/ riscv/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 6386fa4556d9..1bef5d55b2f9 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -46,13 +46,15 @@ extern int amd_iommu_gpt_level; extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ +void amd_iommu_init_identity_domain(void); struct protection_domain *protection_domain_alloc(unsigned int type, int nid); void protection_domain_free(struct protection_domain *domain); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); void amd_iommu_domain_free(struct iommu_domain *dom); int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_domain *old); void amd_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain); @@ -118,9 +120,14 @@ static inline bool check_feature2(u64 mask) return (amd_iommu_efr2 & mask); } +static inline bool amd_iommu_v2_pgtbl_supported(void) +{ + return (check_feature(FEATURE_GIOSUP) && check_feature(FEATURE_GT)); +} + static inline bool amd_iommu_gt_ppr_supported(void) { - return (check_feature(FEATURE_GT) && + return (amd_iommu_v2_pgtbl_supported() && check_feature(FEATURE_PPR) && check_feature(FEATURE_EPHSUP)); } diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 601fb4ee6900..fdb0357e0bb9 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -565,6 +565,12 @@ struct pdom_dev_data { struct list_head list; }; +/* Keeps track of the IOMMUs attached to protection domain */ +struct pdom_iommu_info { + struct amd_iommu *iommu; /* IOMMUs attach to protection domain */ + u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ +}; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -578,8 +584,7 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ - unsigned dev_cnt; /* devices assigned to this domain */ - unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ + struct xarray iommu_array; /* per-IOMMU reference count */ struct mmu_notifier mn; /* mmu notifier for the SVA domain */ struct list_head dev_data_list; /* List of pdom_dev_data */ @@ -831,7 +836,7 @@ struct devid_map { */ struct iommu_dev_data { /*Protect against attach/detach races */ - spinlock_t lock; + struct mutex mutex; struct list_head list; /* For domain->dev_list */ struct llist_node dev_data_list; /* For global dev_data_list */ @@ -873,12 +878,6 @@ extern struct list_head amd_iommu_pci_seg_list; extern struct list_head amd_iommu_list; /* - * Array with pointers to each IOMMU struct - * The indices are referenced in the protection domains - */ -extern struct amd_iommu *amd_iommus[MAX_IOMMUS]; - -/* * Structure defining one entry in the device table */ struct dev_table_entry { @@ -912,14 +911,14 @@ struct unity_map_entry { /* size of the dma_ops aperture as power of 2 */ extern unsigned amd_iommu_aperture_order; -/* allocation bitmap for domain ids */ -extern unsigned long *amd_iommu_pd_alloc_bitmap; - extern bool amd_iommu_force_isolation; /* Max levels of glxval supported */ extern int amd_iommu_max_glx_val; +/* IDA to track protection domain IDs */ +extern struct ida pdom_ids; + /* Global EFR and EFR2 registers */ extern u64 amd_iommu_efr; extern u64 amd_iommu_efr2; diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 43131c3a2172..0e0a531042ac 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -177,9 +177,6 @@ LIST_HEAD(amd_iommu_pci_seg_list); /* list of all PCI segments */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ -/* Array to assign indices to IOMMUs*/ -struct amd_iommu *amd_iommus[MAX_IOMMUS]; - /* Number of IOMMUs present in the system */ static int amd_iommus_present; @@ -194,12 +191,6 @@ bool amd_iommu_force_isolation __read_mostly; unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; -/* - * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap - * to know which ones are already in use. - */ -unsigned long *amd_iommu_pd_alloc_bitmap; - enum iommu_init_state { IOMMU_START_STATE, IOMMU_IVRS_DETECTED, @@ -1082,7 +1073,12 @@ static bool __copy_device_table(struct amd_iommu *iommu) if (dte_v && dom_id) { pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - __set_bit(dom_id, amd_iommu_pd_alloc_bitmap); + /* Reserve the Domain IDs used by previous kernel */ + if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { + pr_err("Failed to reserve domain ID 0x%x\n", dom_id); + memunmap(old_devtb); + return false; + } /* If gcr3 table existed, mask it out */ if (old_devtb[devid].data[0] & DTE_FLAG_GV) { tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; @@ -1744,9 +1740,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, return -ENOSYS; } - /* Index is fine - add IOMMU to the array */ - amd_iommus[iommu->index] = iommu; - /* * Copy data from ACPI table entry to the iommu struct */ @@ -2070,14 +2063,6 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) init_iommu_perf_ctr(iommu); - if (amd_iommu_pgtable == AMD_IOMMU_V2) { - if (!check_feature(FEATURE_GIOSUP) || - !check_feature(FEATURE_GT)) { - pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); - amd_iommu_pgtable = AMD_IOMMU_V1; - } - } - if (is_rd890_iommu(iommu->dev)) { int i, j; @@ -2172,6 +2157,9 @@ static int __init amd_iommu_init_pci(void) struct amd_iommu_pci_seg *pci_seg; int ret; + /* Init global identity domain before registering IOMMU */ + amd_iommu_init_identity_domain(); + for_each_iommu(iommu) { ret = iommu_init_pci(iommu); if (ret) { @@ -2882,11 +2870,6 @@ static void enable_iommus_vapic(void) #endif } -static void enable_iommus(void) -{ - early_enable_iommus(); -} - static void disable_iommus(void) { struct amd_iommu *iommu; @@ -2913,7 +2896,8 @@ static void amd_iommu_resume(void) iommu_apply_resume_quirks(iommu); /* re-load the hardware */ - enable_iommus(); + for_each_iommu(iommu) + early_enable_iommu(iommu); amd_iommu_enable_interrupts(); } @@ -2994,9 +2978,7 @@ static bool __init check_ioapic_information(void) static void __init free_dma_resources(void) { - iommu_free_pages(amd_iommu_pd_alloc_bitmap, - get_order(MAX_DOMAIN_ID / 8)); - amd_iommu_pd_alloc_bitmap = NULL; + ida_destroy(&pdom_ids); free_unity_maps(); } @@ -3064,20 +3046,6 @@ static int __init early_amd_iommu_init(void) amd_iommu_target_ivhd_type = get_highest_supported_ivhd_type(ivrs_base); DUMP_printk("Using IVHD type %#x\n", amd_iommu_target_ivhd_type); - /* Device table - directly used by all IOMMUs */ - ret = -ENOMEM; - - amd_iommu_pd_alloc_bitmap = iommu_alloc_pages(GFP_KERNEL, - get_order(MAX_DOMAIN_ID / 8)); - if (amd_iommu_pd_alloc_bitmap == NULL) - goto out; - - /* - * never allocate domain 0 because its used as the non-allocated and - * error value placeholder - */ - __set_bit(0, amd_iommu_pd_alloc_bitmap); - /* * now the data structures are allocated and basically initialized * start the real acpi table scan @@ -3091,6 +3059,13 @@ static int __init early_amd_iommu_init(void) FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL) amd_iommu_gpt_level = PAGE_MODE_5_LEVEL; + if (amd_iommu_pgtable == AMD_IOMMU_V2) { + if (!amd_iommu_v2_pgtbl_supported()) { + pr_warn("Cannot enable v2 page table for DMA-API. Fallback to v1.\n"); + amd_iommu_pgtable = AMD_IOMMU_V1; + } + } + /* Disable any previously enabled IOMMUs */ if (!is_kdump_kernel() || amd_iommu_disabled) disable_iommus(); diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 804b788f3f16..f3399087859f 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -118,6 +118,7 @@ static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) */ static bool increase_address_space(struct amd_io_pgtable *pgtable, unsigned long address, + unsigned int page_size_level, gfp_t gfp) { struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; @@ -133,7 +134,8 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, spin_lock_irqsave(&domain->lock, flags); - if (address <= PM_LEVEL_SIZE(pgtable->mode)) + if (address <= PM_LEVEL_SIZE(pgtable->mode) && + pgtable->mode - 1 >= page_size_level) goto out; ret = false; @@ -163,18 +165,21 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, gfp_t gfp, bool *updated) { + unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; int level, end_lvl; u64 *pte, *page; BUG_ON(!is_power_of_2(page_size)); - while (address > PM_LEVEL_SIZE(pgtable->mode)) { + while (last_addr > PM_LEVEL_SIZE(pgtable->mode) || + pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) { /* * Return an error if there is no memory to update the * page-table. */ - if (!increase_address_space(pgtable, address, gfp)) + if (!increase_address_space(pgtable, last_addr, + PAGE_SIZE_LEVEL(page_size), gfp)) return NULL; } diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 25b9042fa453..c616de2c5926 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -268,8 +268,11 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, out: if (updated) { struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); + unsigned long flags; + spin_lock_irqsave(&pdom->lock, flags); amd_iommu_domain_flush_pages(pdom, o_iova, size); + spin_unlock_irqrestore(&pdom->lock, flags); } if (mapped) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 8364cd6fa47d..5ce8e6504ba7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -18,6 +18,7 @@ #include <linux/scatterlist.h> #include <linux/dma-map-ops.h> #include <linux/dma-direct.h> +#include <linux/idr.h> #include <linux/iommu-helper.h> #include <linux/delay.h> #include <linux/amd-iommu.h> @@ -52,8 +53,6 @@ #define HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -static DEFINE_SPINLOCK(pd_bitmap_lock); - LIST_HEAD(ioapic_map); LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); @@ -70,9 +69,16 @@ struct iommu_cmd { u32 data[4]; }; +/* + * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + * to know which ones are already in use. + */ +DEFINE_IDA(pdom_ids); + struct kmem_cache *amd_iommu_irq_cache; -static void detach_device(struct device *dev); +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev); static void set_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data); @@ -202,7 +208,7 @@ static struct iommu_dev_data *alloc_dev_data(struct amd_iommu *iommu, u16 devid) if (!dev_data) return NULL; - spin_lock_init(&dev_data->lock); + mutex_init(&dev_data->mutex); dev_data->devid = devid; ratelimit_default_init(&dev_data->rs); @@ -555,22 +561,6 @@ static void iommu_ignore_device(struct amd_iommu *iommu, struct device *dev) setup_aliases(iommu, dev); } -static void amd_iommu_uninit_device(struct device *dev) -{ - struct iommu_dev_data *dev_data; - - dev_data = dev_iommu_priv_get(dev); - if (!dev_data) - return; - - if (dev_data->domain) - detach_device(dev); - - /* - * We keep dev_data around for unplugged devices and reuse it when the - * device is re-plugged - not doing so would introduce a ton of races. - */ -} /**************************************************************************** * @@ -1230,7 +1220,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (!iommu->need_sync) return 0; - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); @@ -1249,18 +1239,17 @@ out_unlock: static void domain_flush_complete(struct protection_domain *domain) { - int i; + struct pdom_iommu_info *pdom_iommu_info; + unsigned long i; - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain && !domain->dev_iommu[i]) - continue; + lockdep_assert_held(&domain->lock); - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. - */ - iommu_completion_wait(amd_iommus[i]); - } + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + xa_for_each(&domain->iommu_array, i, pdom_iommu_info) + iommu_completion_wait(pdom_iommu_info->iommu); } static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) @@ -1442,21 +1431,22 @@ static int domain_flush_pages_v2(struct protection_domain *pdom, static int domain_flush_pages_v1(struct protection_domain *pdom, u64 address, size_t size) { + struct pdom_iommu_info *pdom_iommu_info; struct iommu_cmd cmd; - int ret = 0, i; + int ret = 0; + unsigned long i; + + lockdep_assert_held(&pdom->lock); build_inv_iommu_pages(&cmd, address, size, pdom->id, IOMMU_NO_PASID, false); - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (!pdom->dev_iommu[i]) - continue; - + xa_for_each(&pdom->iommu_array, i, pdom_iommu_info) { /* * Devices of this domain are behind this IOMMU * We need a TLB flush */ - ret |= iommu_queue_command(amd_iommus[i], &cmd); + ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); } return ret; @@ -1495,6 +1485,8 @@ static void __domain_flush_pages(struct protection_domain *domain, void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size) { + lockdep_assert_held(&domain->lock); + if (likely(!amd_iommu_np_cache)) { __domain_flush_pages(domain, address, size); @@ -1640,31 +1632,14 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) * ****************************************************************************/ -static u16 domain_id_alloc(void) +static int pdom_id_alloc(void) { - unsigned long flags; - int id; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); - BUG_ON(id == 0); - if (id > 0 && id < MAX_DOMAIN_ID) - __set_bit(id, amd_iommu_pd_alloc_bitmap); - else - id = 0; - spin_unlock_irqrestore(&pd_bitmap_lock, flags); - - return id; + return ida_alloc_range(&pdom_ids, 1, MAX_DOMAIN_ID - 1, GFP_ATOMIC); } -static void domain_id_free(int id) +static void pdom_id_free(int id) { - unsigned long flags; - - spin_lock_irqsave(&pd_bitmap_lock, flags); - if (id > 0 && id < MAX_DOMAIN_ID) - __clear_bit(id, amd_iommu_pd_alloc_bitmap); - spin_unlock_irqrestore(&pd_bitmap_lock, flags); + ida_free(&pdom_ids, id); } static void free_gcr3_tbl_level1(u64 *tbl) @@ -1709,7 +1684,7 @@ static void free_gcr3_table(struct gcr3_tbl_info *gcr3_info) gcr3_info->glx = 0; /* Free per device domain ID */ - domain_id_free(gcr3_info->domid); + pdom_id_free(gcr3_info->domid); iommu_free_page(gcr3_info->gcr3_tbl); gcr3_info->gcr3_tbl = NULL; @@ -1736,6 +1711,7 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, { int levels = get_gcr3_levels(pasids); int nid = iommu ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; + int domid; if (levels > amd_iommu_max_glx_val) return -EINVAL; @@ -1744,11 +1720,14 @@ static int setup_gcr3_table(struct gcr3_tbl_info *gcr3_info, return -EBUSY; /* Allocate per device domain ID */ - gcr3_info->domid = domain_id_alloc(); + domid = pdom_id_alloc(); + if (domid <= 0) + return -ENOSPC; + gcr3_info->domid = domid; gcr3_info->gcr3_tbl = iommu_alloc_page_node(nid, GFP_ATOMIC); if (gcr3_info->gcr3_tbl == NULL) { - domain_id_free(gcr3_info->domid); + pdom_id_free(domid); return -ENOMEM; } @@ -2019,57 +1998,69 @@ static void destroy_gcr3_table(struct iommu_dev_data *dev_data, free_gcr3_table(gcr3_info); } -static int do_attach(struct iommu_dev_data *dev_data, - struct protection_domain *domain) +static int pdom_attach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) { - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - struct io_pgtable_cfg *cfg = &domain->iop.pgtbl.cfg; + struct pdom_iommu_info *pdom_iommu_info, *curr; + struct io_pgtable_cfg *cfg = &pdom->iop.pgtbl.cfg; + unsigned long flags; int ret = 0; - /* Update data structures */ - dev_data->domain = domain; - list_add(&dev_data->list, &domain->dev_list); + spin_lock_irqsave(&pdom->lock, flags); - /* Update NUMA Node ID */ - if (cfg->amd.nid == NUMA_NO_NODE) - cfg->amd.nid = dev_to_node(dev_data->dev); + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (pdom_iommu_info) { + pdom_iommu_info->refcnt++; + goto out_unlock; + } - /* Do reference counting */ - domain->dev_iommu[iommu->index] += 1; - domain->dev_cnt += 1; + pdom_iommu_info = kzalloc(sizeof(*pdom_iommu_info), GFP_ATOMIC); + if (!pdom_iommu_info) { + ret = -ENOMEM; + goto out_unlock; + } - /* Setup GCR3 table */ - if (pdom_is_sva_capable(domain)) { - ret = init_gcr3_table(dev_data, domain); - if (ret) - return ret; + pdom_iommu_info->iommu = iommu; + pdom_iommu_info->refcnt = 1; + + curr = xa_cmpxchg(&pdom->iommu_array, iommu->index, + NULL, pdom_iommu_info, GFP_ATOMIC); + if (curr) { + kfree(pdom_iommu_info); + ret = -ENOSPC; + goto out_unlock; } + /* Update NUMA Node ID */ + if (cfg->amd.nid == NUMA_NO_NODE) + cfg->amd.nid = dev_to_node(&iommu->dev->dev); + +out_unlock: + spin_unlock_irqrestore(&pdom->lock, flags); return ret; } -static void do_detach(struct iommu_dev_data *dev_data) +static void pdom_detach_iommu(struct amd_iommu *iommu, + struct protection_domain *pdom) { - struct protection_domain *domain = dev_data->domain; - struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); - - /* Clear DTE and flush the entry */ - dev_update_dte(dev_data, false); + struct pdom_iommu_info *pdom_iommu_info; + unsigned long flags; - /* Flush IOTLB and wait for the flushes to finish */ - amd_iommu_domain_flush_all(domain); + spin_lock_irqsave(&pdom->lock, flags); - /* Clear GCR3 table */ - if (pdom_is_sva_capable(domain)) - destroy_gcr3_table(dev_data, domain); + pdom_iommu_info = xa_load(&pdom->iommu_array, iommu->index); + if (!pdom_iommu_info) { + spin_unlock_irqrestore(&pdom->lock, flags); + return; + } - /* Update data structures */ - dev_data->domain = NULL; - list_del(&dev_data->list); + pdom_iommu_info->refcnt--; + if (pdom_iommu_info->refcnt == 0) { + xa_erase(&pdom->iommu_array, iommu->index); + kfree(pdom_iommu_info); + } - /* decrease reference counters - needs to happen after the flushes */ - domain->dev_iommu[iommu->index] -= 1; - domain->dev_cnt -= 1; + spin_unlock_irqrestore(&pdom->lock, flags); } /* @@ -2079,27 +2070,56 @@ static void do_detach(struct iommu_dev_data *dev_data) static int attach_device(struct device *dev, struct protection_domain *domain) { - struct iommu_dev_data *dev_data; - unsigned long flags; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct pci_dev *pdev; int ret = 0; - spin_lock_irqsave(&domain->lock, flags); - - dev_data = dev_iommu_priv_get(dev); - - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); if (dev_data->domain != NULL) { ret = -EBUSY; goto out; } - ret = do_attach(dev_data, domain); + /* Do reference counting */ + ret = pdom_attach_iommu(iommu, domain); + if (ret) + goto out; -out: - spin_unlock(&dev_data->lock); + /* Setup GCR3 table */ + if (pdom_is_sva_capable(domain)) { + ret = init_gcr3_table(dev_data, domain); + if (ret) { + pdom_detach_iommu(iommu, domain); + goto out; + } + } - spin_unlock_irqrestore(&domain->lock, flags); + pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; + if (pdev && pdom_is_sva_capable(domain)) { + pdev_enable_caps(pdev); + + /* + * Device can continue to function even if IOPF + * enablement failed. Hence in error path just + * disable device PRI support. + */ + if (amd_iommu_iopf_add_device(iommu, dev_data)) + pdev_disable_cap_pri(pdev); + } else if (pdev) { + pdev_enable_cap_ats(pdev); + } + + /* Update data structures */ + dev_data->domain = domain; + list_add(&dev_data->list, &domain->dev_list); + + /* Update device table */ + dev_update_dte(dev_data, true); + +out: + mutex_unlock(&dev_data->mutex); return ret; } @@ -2110,14 +2130,11 @@ out: static void detach_device(struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - struct protection_domain *domain = dev_data->domain; struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct protection_domain *domain = dev_data->domain; unsigned long flags; - bool ppr = dev_data->ppr; - - spin_lock_irqsave(&domain->lock, flags); - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); /* * First check if the device is still attached. It might already @@ -2128,27 +2145,36 @@ static void detach_device(struct device *dev) if (WARN_ON(!dev_data->domain)) goto out; - if (ppr) { + /* Remove IOPF handler */ + if (dev_data->ppr) { iopf_queue_flush_dev(dev); - - /* Updated here so that it gets reflected in DTE */ - dev_data->ppr = false; + amd_iommu_iopf_remove_device(iommu, dev_data); } - do_detach(dev_data); + if (dev_is_pci(dev)) + pdev_disable_caps(to_pci_dev(dev)); -out: - spin_unlock(&dev_data->lock); + /* Clear DTE and flush the entry */ + dev_update_dte(dev_data, false); + /* Flush IOTLB and wait for the flushes to finish */ + spin_lock_irqsave(&domain->lock, flags); + amd_iommu_domain_flush_all(domain); spin_unlock_irqrestore(&domain->lock, flags); - /* Remove IOPF handler */ - if (ppr) - amd_iommu_iopf_remove_device(iommu, dev_data); + /* Clear GCR3 table */ + if (pdom_is_sva_capable(domain)) + destroy_gcr3_table(dev_data, domain); - if (dev_is_pci(dev)) - pdev_disable_caps(to_pci_dev(dev)); + /* Update data structures */ + dev_data->domain = NULL; + list_del(&dev_data->list); + + /* decrease reference counters - needs to happen after the flushes */ + pdom_detach_iommu(iommu, domain); +out: + mutex_unlock(&dev_data->mutex); } static struct iommu_device *amd_iommu_probe_device(struct device *dev) @@ -2205,17 +2231,14 @@ out_err: static void amd_iommu_release_device(struct device *dev) { - struct amd_iommu *iommu; - - if (!check_device(dev)) - return; + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return; + WARN_ON(dev_data->domain); - amd_iommu_uninit_device(dev); - iommu_completion_wait(iommu); + /* + * We keep dev_data around for unplugged devices and reuse it when the + * device is re-plugged - not doing so would introduce a ton of races. + */ } static struct iommu_group *amd_iommu_device_group(struct device *dev) @@ -2236,70 +2259,53 @@ static struct iommu_group *amd_iommu_device_group(struct device *dev) * *****************************************************************************/ -static void cleanup_domain(struct protection_domain *domain) -{ - struct iommu_dev_data *entry; - - lockdep_assert_held(&domain->lock); - - if (!domain->dev_cnt) - return; - - while (!list_empty(&domain->dev_list)) { - entry = list_first_entry(&domain->dev_list, - struct iommu_dev_data, list); - BUG_ON(!entry->domain); - do_detach(entry); - } - WARN_ON(domain->dev_cnt != 0); -} - void protection_domain_free(struct protection_domain *domain) { WARN_ON(!list_empty(&domain->dev_list)); if (domain->domain.type & __IOMMU_DOMAIN_PAGING) free_io_pgtable_ops(&domain->iop.pgtbl.ops); - domain_id_free(domain->id); + pdom_id_free(domain->id); kfree(domain); } +static void protection_domain_init(struct protection_domain *domain, int nid) +{ + spin_lock_init(&domain->lock); + INIT_LIST_HEAD(&domain->dev_list); + INIT_LIST_HEAD(&domain->dev_data_list); + xa_init(&domain->iommu_array); + domain->iop.pgtbl.cfg.amd.nid = nid; +} + struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { - struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; - int pgtable; + int domid; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) return NULL; - domain->id = domain_id_alloc(); - if (!domain->id) - goto err_free; + domid = pdom_id_alloc(); + if (domid <= 0) { + kfree(domain); + return NULL; + } + domain->id = domid; - spin_lock_init(&domain->lock); - INIT_LIST_HEAD(&domain->dev_list); - INIT_LIST_HEAD(&domain->dev_data_list); - domain->iop.pgtbl.cfg.amd.nid = nid; + protection_domain_init(domain, nid); + + return domain; +} + +static int pdom_setup_pgtable(struct protection_domain *domain, + unsigned int type, int pgtable) +{ + struct io_pgtable_ops *pgtbl_ops; - switch (type) { /* No need to allocate io pgtable ops in passthrough mode */ - case IOMMU_DOMAIN_IDENTITY: - case IOMMU_DOMAIN_SVA: - return domain; - case IOMMU_DOMAIN_DMA: - pgtable = amd_iommu_pgtable; - break; - /* - * Force IOMMU v1 page table when allocating - * domain for pass-through devices. - */ - case IOMMU_DOMAIN_UNMANAGED: - pgtable = AMD_IOMMU_V1; - break; - default: - goto err_id; - } + if (!(type & __IOMMU_DOMAIN_PAGING)) + return 0; switch (pgtable) { case AMD_IOMMU_V1: @@ -2309,25 +2315,20 @@ struct protection_domain *protection_domain_alloc(unsigned int type, int nid) domain->pd_mode = PD_MODE_V2; break; default: - goto err_id; + return -EINVAL; } pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl.cfg, domain); if (!pgtbl_ops) - goto err_id; + return -ENOMEM; - return domain; -err_id: - domain_id_free(domain->id); -err_free: - kfree(domain); - return NULL; + return 0; } -static inline u64 dma_max_address(void) +static inline u64 dma_max_address(int pgtable) { - if (amd_iommu_pgtable == AMD_IOMMU_V1) + if (pgtable == AMD_IOMMU_V1) return ~0ULL; /* V2 with 4/5 level page table */ @@ -2340,11 +2341,13 @@ static bool amd_iommu_hd_support(struct amd_iommu *iommu) } static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, - struct device *dev, u32 flags) + struct device *dev, + u32 flags, int pgtable) { bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; struct amd_iommu *iommu = NULL; + int ret; if (dev) iommu = get_amd_iommu_from_dev(dev); @@ -2356,16 +2359,20 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) return ERR_PTR(-EINVAL); - if (dirty_tracking && !amd_iommu_hd_support(iommu)) - return ERR_PTR(-EOPNOTSUPP); - domain = protection_domain_alloc(type, dev ? dev_to_node(dev) : NUMA_NO_NODE); if (!domain) return ERR_PTR(-ENOMEM); + ret = pdom_setup_pgtable(domain, type, pgtable); + if (ret) { + pdom_id_free(domain->id); + kfree(domain); + return ERR_PTR(ret); + } + domain->domain.geometry.aperture_start = 0; - domain->domain.geometry.aperture_end = dma_max_address(); + domain->domain.geometry.aperture_end = dma_max_address(pgtable); domain->domain.geometry.force_aperture = true; domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; @@ -2383,8 +2390,16 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) { struct iommu_domain *domain; + int pgtable = amd_iommu_pgtable; - domain = do_iommu_domain_alloc(type, NULL, 0); + /* + * Force IOMMU v1 page table when allocating + * domain for pass-through devices. + */ + if (type == IOMMU_DOMAIN_UNMANAGED) + pgtable = AMD_IOMMU_V1; + + domain = do_iommu_domain_alloc(type, NULL, 0, pgtable); if (IS_ERR(domain)) return NULL; @@ -2398,25 +2413,41 @@ amd_iommu_domain_alloc_user(struct device *dev, u32 flags, { unsigned int type = IOMMU_DOMAIN_UNMANAGED; + struct amd_iommu *iommu = NULL; + const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID; - if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) + if (dev) + iommu = get_amd_iommu_from_dev(dev); + + if ((flags & ~supported_flags) || parent || user_data) return ERR_PTR(-EOPNOTSUPP); - return do_iommu_domain_alloc(type, dev, flags); -} + /* Allocate domain with v2 page table if IOMMU supports PASID. */ + if (flags & IOMMU_HWPT_ALLOC_PASID) { + if (!amd_iommu_pasid_supported()) + return ERR_PTR(-EOPNOTSUPP); -void amd_iommu_domain_free(struct iommu_domain *dom) -{ - struct protection_domain *domain; - unsigned long flags; + return do_iommu_domain_alloc(type, dev, flags, AMD_IOMMU_V2); + } - domain = to_pdomain(dom); + /* Allocate domain with v1 page table for dirty tracking */ + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) { + if (iommu && amd_iommu_hd_support(iommu)) { + return do_iommu_domain_alloc(type, dev, + flags, AMD_IOMMU_V1); + } - spin_lock_irqsave(&domain->lock, flags); + return ERR_PTR(-EOPNOTSUPP); + } - cleanup_domain(domain); + /* If nothing specific is required use the kernel commandline default */ + return do_iommu_domain_alloc(type, dev, 0, amd_iommu_pgtable); +} - spin_unlock_irqrestore(&domain->lock, flags); +void amd_iommu_domain_free(struct iommu_domain *dom) +{ + struct protection_domain *domain = to_pdomain(dom); protection_domain_free(domain); } @@ -2430,9 +2461,9 @@ static int blocked_domain_attach_device(struct iommu_domain *domain, detach_device(dev); /* Clear DTE and flush the entry */ - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->mutex); dev_update_dte(dev_data, false); - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->mutex); return 0; } @@ -2444,13 +2475,39 @@ static struct iommu_domain blocked_domain = { } }; +static struct protection_domain identity_domain; + +static const struct iommu_domain_ops identity_domain_ops = { + .attach_dev = amd_iommu_attach_device, +}; + +void amd_iommu_init_identity_domain(void) +{ + struct iommu_domain *domain = &identity_domain.domain; + + domain->type = IOMMU_DOMAIN_IDENTITY; + domain->ops = &identity_domain_ops; + domain->owner = &amd_iommu_ops; + + identity_domain.id = pdom_id_alloc(); + + protection_domain_init(&identity_domain, NUMA_NO_NODE); +} + +/* Same as blocked domain except it supports only ops->attach_dev() */ +static struct iommu_domain release_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocked_domain_attach_device, + } +}; + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); - struct pci_dev *pdev; int ret; /* @@ -2483,24 +2540,6 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } #endif - pdev = dev_is_pci(dev_data->dev) ? to_pci_dev(dev_data->dev) : NULL; - if (pdev && pdom_is_sva_capable(domain)) { - pdev_enable_caps(pdev); - - /* - * Device can continue to function even if IOPF - * enablement failed. Hence in error path just - * disable device PRI support. - */ - if (amd_iommu_iopf_add_device(iommu, dev_data)) - pdev_disable_cap_pri(pdev); - } else if (pdev) { - pdev_enable_cap_ats(pdev); - } - - /* Update device table */ - dev_update_dte(dev_data, true); - return ret; } @@ -2842,6 +2881,8 @@ static int amd_iommu_dev_disable_feature(struct device *dev, const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .blocked_domain = &blocked_domain, + .release_domain = &release_domain, + .identity_domain = &identity_domain.domain, .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_user = amd_iommu_domain_alloc_user, .domain_alloc_sva = amd_iommu_domain_alloc_sva, @@ -2890,7 +2931,7 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_add_return(1, &iommu->cmd_sem_val); + data = atomic64_inc_return(&iommu->cmd_sem_val); build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index 0657b9373be5..8c73a30c2800 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -100,7 +100,8 @@ static const struct mmu_notifier_ops sva_mn = { }; int iommu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct pdom_dev_data *pdom_dev_data; struct protection_domain *sva_pdom = to_pdomain(domain); @@ -108,6 +109,9 @@ int iommu_sva_set_dev_pasid(struct iommu_domain *domain, unsigned long flags; int ret = -EINVAL; + if (old) + return -EOPNOTSUPP; + /* PASID zero is used for requests from the I/O device without PASID */ if (!is_pasid_valid(dev_data, pasid)) return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index dc98c88b48c8..493a659cc66b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-y := arm-smmu-v3.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_IOMMUFD) += arm-smmu-v3-iommufd.o arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c new file mode 100644 index 000000000000..6cc14d82399f --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include <uapi/linux/iommufd.h> + +#include "arm-smmu-v3.h" + +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommu_hw_info_arm_smmuv3 *info; + u32 __iomem *base_idr; + unsigned int i; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + base_idr = master->smmu->base + ARM_SMMU_IDR0; + for (i = 0; i <= 5; i++) + info->idr[i] = readl_relaxed(base_idr + i); + info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR); + info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3; + + return info; +} + +static void arm_smmu_make_nested_cd_table_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + arm_smmu_make_s2_domain_ste( + target, master, nested_domain->vsmmu->s2_parent, ats_enabled); + + target->data[0] = cpu_to_le64(STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)); + target->data[0] |= nested_domain->ste[0] & + ~cpu_to_le64(STRTAB_STE_0_CFG); + target->data[1] |= nested_domain->ste[1]; +} + +/* + * Create a physical STE from the virtual STE that userspace provided when it + * created the nested domain. Using the vSTE userspace can request: + * - Non-valid STE + * - Abort STE + * - Bypass STE (install the S2, no CD table) + * - CD table STE (install the S2 and the userspace CD table) + */ +static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + unsigned int cfg = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])); + + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into an abort physical STE with the intention that + * C_BAD_STE for this SID can be generated to userspace. + */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) + cfg = STRTAB_STE_0_CFG_ABORT; + + switch (cfg) { + case STRTAB_STE_0_CFG_S1_TRANS: + arm_smmu_make_nested_cd_table_ste(target, master, nested_domain, + ats_enabled); + break; + case STRTAB_STE_0_CFG_BYPASS: + arm_smmu_make_s2_domain_ste(target, master, + nested_domain->vsmmu->s2_parent, + ats_enabled); + break; + case STRTAB_STE_0_CFG_ABORT: + default: + arm_smmu_make_abort_ste(target); + break; + } +} + +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_nested_domain *nested_domain = + to_smmu_nested_domain(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = iommu_get_domain_for_dev(dev), + .ssid = IOMMU_NO_PASID, + }; + struct arm_smmu_ste ste; + int ret; + + if (nested_domain->vsmmu->smmu != master->smmu) + return -EINVAL; + if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; + + mutex_lock(&arm_smmu_asid_lock); + /* + * The VM has to control the actual ATS state at the PCI device because + * we forward the invalidations directly from the VM. If the VM doesn't + * think ATS is on it will not generate ATC flushes and the ATC will + * become incoherent. Since we can't access the actual virtual PCI ATS + * config bit here base this off the EATS value in the STE. If the EATS + * is set then the VM must generate ATC flushes. + */ + state.disable_ats = !nested_domain->enable_ats; + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } + + arm_smmu_make_nested_domain_ste(&ste, master, nested_domain, + state.ats_enabled); + arm_smmu_install_ste_for_dev(master, &ste); + arm_smmu_attach_commit(&state); + mutex_unlock(&arm_smmu_asid_lock); + return 0; +} + +static void arm_smmu_domain_nested_free(struct iommu_domain *domain) +{ + kfree(to_smmu_nested_domain(domain)); +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, +}; + +static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, + bool *enable_ats) +{ + unsigned int eats; + unsigned int cfg; + + if (!(arg->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + memset(arg->ste, 0, sizeof(arg->ste)); + return 0; + } + + /* EIO is reserved for invalid STE data. */ + if ((arg->ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg->ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return -EIO; + + cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg->ste[0])); + if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS && + cfg != STRTAB_STE_0_CFG_S1_TRANS) + return -EIO; + + /* + * Only Full ATS or ATS UR is supported + * The EATS field will be set by arm_smmu_make_nested_domain_ste() + */ + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg->ste[1])); + arg->ste[1] &= ~cpu_to_le64(STRTAB_STE_1_EATS); + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) + return -EIO; + + if (cfg == STRTAB_STE_0_CFG_S1_TRANS) + *enable_ats = (eats == STRTAB_STE_1_EATS_TRANS); + return 0; +} + +static struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + const u32 SUPPORTED_FLAGS = IOMMU_HWPT_FAULT_ID_VALID; + struct arm_smmu_nested_domain *nested_domain; + struct iommu_hwpt_arm_smmuv3 arg; + bool enable_ats = false; + int ret; + + /* + * Faults delivered to the nested domain are faults that originated by + * the S1 in the domain. The core code will match all PASIDs when + * delivering the fault due to user_pasid_table + */ + if (flags & ~SUPPORTED_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); + if (ret) + return ERR_PTR(ret); + + ret = arm_smmu_validate_vste(&arg, &enable_ats); + if (ret) + return ERR_PTR(ret); + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->enable_ats = enable_ats; + nested_domain->vsmmu = vsmmu; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + + return &nested_domain->domain; +} + +static int arm_vsmmu_vsid_to_sid(struct arm_vsmmu *vsmmu, u32 vsid, u32 *sid) +{ + struct arm_smmu_master *master; + struct device *dev; + int ret = 0; + + xa_lock(&vsmmu->core.vdevs); + dev = iommufd_viommu_find_dev(&vsmmu->core, (unsigned long)vsid); + if (!dev) { + ret = -EIO; + goto unlock; + } + master = dev_iommu_priv_get(dev); + + /* At this moment, iommufd only supports PCI device that has one SID */ + if (sid) + *sid = master->streams[0].id; +unlock: + xa_unlock(&vsmmu->core.vdevs); + return ret; +} + +/* This is basically iommu_viommu_arm_smmuv3_invalidate in u64 for conversion */ +struct arm_vsmmu_invalidation_cmd { + union { + u64 cmd[2]; + struct iommu_viommu_arm_smmuv3_invalidate ucmd; + }; +}; + +/* + * Convert, in place, the raw invalidation command into an internal format that + * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are + * stored in CPU endian. + * + * Enforce the VMID or SID on the command. + */ +static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, + struct arm_vsmmu_invalidation_cmd *cmd) +{ + /* Commands are le64 stored in u64 */ + cmd->cmd[0] = le64_to_cpu(cmd->ucmd.cmd[0]); + cmd->cmd[1] = le64_to_cpu(cmd->ucmd.cmd[1]); + + switch (cmd->cmd[0] & CMDQ_0_OP) { + case CMDQ_OP_TLBI_NSNH_ALL: + /* Convert to NH_ALL */ + cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL | + FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + cmd->cmd[1] = 0; + break; + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_NH_VAA: + case CMDQ_OP_TLBI_NH_ALL: + case CMDQ_OP_TLBI_NH_ASID: + cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vsmmu->vmid); + break; + case CMDQ_OP_ATC_INV: + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: { + u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]); + + if (arm_vsmmu_vsid_to_sid(vsmmu, vsid, &sid)) + return -EIO; + cmd->cmd[0] &= ~CMDQ_CFGI_0_SID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid); + break; + } + default: + return -EIO; + } + return 0; +} + +static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = vsmmu->smmu; + struct arm_vsmmu_invalidation_cmd *last; + struct arm_vsmmu_invalidation_cmd *cmds; + struct arm_vsmmu_invalidation_cmd *cur; + struct arm_vsmmu_invalidation_cmd *end; + int ret; + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 2 * sizeof(u64)); + ret = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3); + if (ret) + goto out; + + last = cmds; + while (cur != end) { + ret = arm_vsmmu_convert_user_cmd(vsmmu, cur); + if (ret) + goto out; + + /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? */ + cur++; + if (cur != end && (cur - last) != CMDQ_BATCH_ENTRIES - 1) + continue; + + /* FIXME always uses the main cmdq rather than trying to group by type */ + ret = arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, last->cmd, + cur - last, true); + if (ret) { + cur--; + goto out; + } + last = cur; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return ret; +} + +static const struct iommufd_viommu_ops arm_vsmmu_ops = { + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + .cache_invalidate = arm_vsmmu_cache_invalidate, +}; + +struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, + struct iommu_domain *parent, + struct iommufd_ctx *ictx, + unsigned int viommu_type) +{ + struct arm_smmu_device *smmu = + iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_domain *s2_parent = to_smmu_domain(parent); + struct arm_vsmmu *vsmmu; + + if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) + return ERR_PTR(-EOPNOTSUPP); + + if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) + return ERR_PTR(-EOPNOTSUPP); + + if (s2_parent->smmu != master->smmu) + return ERR_PTR(-EINVAL); + + /* + * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW + * defect is needed to determine if arm_vsmmu_cache_invalidate() needs + * any change to remove this. + */ + if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * Must support some way to prevent the VM from bypassing the cache + * because VFIO currently does not do any cache maintenance. canwbs + * indicates the device is fully coherent and no cache maintenance is + * ever required, even for PCI No-Snoop. S2FWB means the S1 can't make + * things non-coherent using the memattr, but No-Snoop behavior is not + * effected. + */ + if (!arm_smmu_master_canwbs(master) && + !(smmu->features & ARM_SMMU_FEAT_S2FWB)) + return ERR_PTR(-EOPNOTSUPP); + + vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core, + &arm_vsmmu_ops); + if (IS_ERR(vsmmu)) + return ERR_CAST(vsmmu); + + vsmmu->smmu = smmu; + vsmmu->s2_parent = s2_parent; + /* FIXME Move VMID allocation from the S2 domain allocation to here */ + vsmmu->vmid = s2_parent->s2_cfg.vmid; + + return &vsmmu->core; +} + +MODULE_IMPORT_NS(IOMMUFD); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index a7c36654dee5..1d3e71569775 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -332,7 +332,8 @@ void arm_smmu_sva_notifier_synchronize(void) } static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); @@ -348,7 +349,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, * get reassigned */ arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid); - ret = arm_smmu_set_pasid(master, smmu_domain, id, &target); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target, old); mmput(domain->mm); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 353fea58cd31..04630dbfedd9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -765,9 +766,9 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, * insert their own list of commands then all of the commands from one * CPU will appear before any of the commands from the other CPU. */ -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq *cmdq, - u64 *cmds, int n, bool sync) +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync) { u64 cmd_sync[CMDQ_ENT_DWORDS]; u32 prod; @@ -1045,7 +1046,8 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) /* S2 translates */ if (cfg & BIT(1)) { used_bits[1] |= - cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + cpu_to_le64(STRTAB_STE_1_S2FWB | STRTAB_STE_1_EATS | + STRTAB_STE_1_SHCFG); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | @@ -1549,7 +1551,6 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, } } -VISIBLE_IF_KUNIT void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); @@ -1632,7 +1633,6 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); -VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, @@ -1655,6 +1655,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_EATS, ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB); if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); @@ -2105,7 +2107,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!master->ats_enabled) continue; - arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd); + if (master_domain->nested_ats_flush) { + /* + * If a S2 used as a nesting parent is changed we have + * no option but to completely flush the ATC. + */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + } for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; @@ -2232,6 +2243,15 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, } __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); + if (smmu_domain->nest_parent) { + /* + * When the S2 domain changes all the nested S1 ASIDs have to be + * flushed too. + */ + cmd.opcode = CMDQ_OP_TLBI_NH_ALL; + arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); + } + /* * Unfortunately, this can't be leaf-only since we may have * zapped an entire table. @@ -2293,6 +2313,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_CACHE_COHERENCY: /* Assume that a coherent TCU implies coherent TBUs */ return master->smmu->features & ARM_SMMU_FEAT_COHERENCY; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return arm_smmu_master_canwbs(master); case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; @@ -2303,6 +2325,26 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } +static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + bool ret = true; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (!arm_smmu_master_canwbs(master_domain->master)) { + ret = false; + break; + } + } + smmu_domain->enforce_cache_coherency = ret; + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return ret; +} + struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; @@ -2442,6 +2484,9 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; + if ((smmu->features & ARM_SMMU_FEAT_S2FWB) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB; break; default: return -EINVAL; @@ -2483,8 +2528,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, - const struct arm_smmu_ste *target) +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; @@ -2595,7 +2640,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static struct arm_smmu_master_domain * arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, struct arm_smmu_master *master, - ioasid_t ssid) + ioasid_t ssid, bool nested_ats_flush) { struct arm_smmu_master_domain *master_domain; @@ -2604,7 +2649,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { if (master_domain->master == master && - master_domain->ssid == ssid) + master_domain->ssid == ssid && + master_domain->nested_ats_flush == nested_ats_flush) return master_domain; } return NULL; @@ -2624,6 +2670,8 @@ to_smmu_domain_devices(struct iommu_domain *domain) if ((domain->type & __IOMMU_DOMAIN_PAGING) || domain->type == IOMMU_DOMAIN_SVA) return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return to_smmu_nested_domain(domain)->vsmmu->s2_parent; return NULL; } @@ -2633,13 +2681,18 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, { struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); struct arm_smmu_master_domain *master_domain; + bool nested_ats_flush = false; unsigned long flags; if (!smmu_domain) return; + if (domain->type == IOMMU_DOMAIN_NESTED) + nested_ats_flush = to_smmu_nested_domain(domain)->enable_ats; + spin_lock_irqsave(&smmu_domain->devices_lock, flags); - master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid); + master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid, + nested_ats_flush); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); @@ -2649,16 +2702,6 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); } -struct arm_smmu_attach_state { - /* Inputs */ - struct iommu_domain *old_domain; - struct arm_smmu_master *master; - bool cd_needs_ats; - ioasid_t ssid; - /* Resulting state */ - bool ats_enabled; -}; - /* * Start the sequence to attach a domain to a master. The sequence contains three * steps: @@ -2679,8 +2722,8 @@ struct arm_smmu_attach_state { * new_domain can be a non-paging domain. In this case ATS will not be enabled, * and invalidations won't be tracked. */ -static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, - struct iommu_domain *new_domain) +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain) { struct arm_smmu_master *master = state->master; struct arm_smmu_master_domain *master_domain; @@ -2706,7 +2749,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * enabled if we have arm_smmu_domain, those always have page * tables. */ - state->ats_enabled = arm_smmu_ats_supported(master); + state->ats_enabled = !state->disable_ats && + arm_smmu_ats_supported(master); } if (smmu_domain) { @@ -2715,6 +2759,9 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, return -ENOMEM; master_domain->master = master; master_domain->ssid = state->ssid; + if (new_domain->type == IOMMU_DOMAIN_NESTED) + master_domain->nested_ats_flush = + to_smmu_nested_domain(new_domain)->enable_ats; /* * During prepare we want the current smmu_domain and new @@ -2731,6 +2778,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * one of them. */ spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (smmu_domain->enforce_cache_coherency && + !arm_smmu_master_canwbs(master)) { + spin_unlock_irqrestore(&smmu_domain->devices_lock, + flags); + kfree(master_domain); + return -EINVAL; + } + if (state->ats_enabled) atomic_inc(&smmu_domain->nr_ats_masters); list_add(&master_domain->devices_elm, &smmu_domain->devices); @@ -2754,7 +2809,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * completes synchronizing the PCI device's ATC and finishes manipulating the * smmu_domain->devices list. */ -static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) { struct arm_smmu_master *master = state->master; @@ -2856,7 +2911,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) + struct device *dev, ioasid_t id, + struct iommu_domain *old) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); @@ -2882,7 +2938,7 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, */ arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, - &target_cd); + &target_cd, old); } static void arm_smmu_update_ste(struct arm_smmu_master *master, @@ -2912,16 +2968,13 @@ static void arm_smmu_update_ste(struct arm_smmu_master *master, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd) + struct arm_smmu_cd *cd, struct iommu_domain *old) { struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { .master = master, - /* - * For now the core code prevents calling this when a domain is - * already attached, no need to set old_domain. - */ .ssid = pasid, + .old_domain = old, }; struct arm_smmu_cd *cdptr; int ret; @@ -3084,7 +3137,9 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID | + IOMMU_HWPT_ALLOC_NEST_PARENT; struct arm_smmu_domain *smmu_domain; int ret; @@ -3093,10 +3148,22 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (parent || user_data) return ERR_PTR(-EOPNOTSUPP); + if (flags & IOMMU_HWPT_ALLOC_PASID) + return arm_smmu_domain_alloc_paging(dev); + smmu_domain = arm_smmu_domain_alloc(); if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nest_parent = true; + } + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); @@ -3378,21 +3445,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -3491,6 +3543,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, + .hw_info = arm_smmu_hw_info, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, .domain_alloc_user = arm_smmu_domain_alloc_user, @@ -3504,17 +3557,19 @@ static struct iommu_ops arm_smmu_ops = { .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, + .viommu_alloc = arm_vsmmu_alloc, + .user_pasid_table = 1, .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .enforce_cache_coherency = arm_smmu_enforce_cache_coherency, .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .free = arm_smmu_domain_free_paging, } }; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1e9952ca989f..0107d3f333a1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -10,6 +10,7 @@ #include <linux/bitfield.h> #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/kernel.h> #include <linux/mmzone.h> #include <linux/sizes.h> @@ -57,6 +58,7 @@ struct arm_smmu_device; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define ARM_SMMU_IDR5 0x14 @@ -81,6 +83,8 @@ struct arm_smmu_device; #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) @@ -241,6 +245,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -261,6 +266,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_S1COR GENMASK_ULL(5, 4) #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) +#define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_S1STALLD (1UL << 27) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) @@ -292,6 +298,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) + /* * Context descriptors. * @@ -511,8 +526,10 @@ struct arm_smmu_cmdq_ent { }; } cfgi; + #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 + #define CMDQ_OP_TLBI_NH_VAA 0x13 #define CMDQ_OP_TLBI_EL2_ALL 0x20 #define CMDQ_OP_TLBI_EL2_ASID 0x21 #define CMDQ_OP_TLBI_EL2_VA 0x22 @@ -726,6 +743,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) #define ARM_SMMU_FEAT_HA (1 << 21) #define ARM_SMMU_FEAT_HD (1 << 22) +#define ARM_SMMU_FEAT_S2FWB (1 << 23) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) @@ -811,10 +829,20 @@ struct arm_smmu_domain { /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; + bool enforce_cache_coherency : 1; + bool nest_parent : 1; struct mmu_notifier mmu_notifier; }; +struct arm_smmu_nested_domain { + struct iommu_domain domain; + struct arm_vsmmu *vsmmu; + bool enable_ats : 1; + + __le64 ste[2]; +}; + /* The following are exposed for testing purposes. */ struct arm_smmu_entry_writer_ops; struct arm_smmu_entry_writer { @@ -827,21 +855,22 @@ struct arm_smmu_entry_writer_ops { void (*sync)(struct arm_smmu_entry_writer *writer); }; +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, + bool ats_enabled); + #if IS_ENABLED(CONFIG_KUNIT) void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, const __le64 *target); void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits); -void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, struct arm_smmu_ste *target); void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, bool ats_enabled, unsigned int s1dss); -void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain, - bool ats_enabled); void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct mm_struct *mm, u16 asid); @@ -851,6 +880,7 @@ struct arm_smmu_master_domain { struct list_head devices_elm; struct arm_smmu_master *master; ioasid_t ssid; + bool nested_ats_flush : 1; }; static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) @@ -858,6 +888,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) return container_of(dom, struct arm_smmu_domain, domain); } +static inline struct arm_smmu_nested_domain * +to_smmu_nested_domain(struct iommu_domain *dom) +{ + return container_of(dom, struct arm_smmu_nested_domain, domain); +} + extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; @@ -875,7 +911,7 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - struct arm_smmu_cd *cd); + struct arm_smmu_cd *cd, struct iommu_domain *old); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, @@ -893,6 +929,33 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq); +static inline bool arm_smmu_master_canwbs(struct arm_smmu_master *master) +{ + return dev_iommu_fwspec_get(master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS; +} + +struct arm_smmu_attach_state { + /* Inputs */ + struct iommu_domain *old_domain; + struct arm_smmu_master *master; + bool cd_needs_ats; + bool disable_ats; + ioasid_t ssid; + /* Resulting state */ + bool ats_enabled; +}; + +int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain); +void arm_smmu_attach_commit(struct arm_smmu_attach_state *state); +void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target); + +int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, + bool sync); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); @@ -949,4 +1012,23 @@ tegra241_cmdqv_probe(struct arm_smmu_device *smmu) return ERR_PTR(-ENODEV); } #endif /* CONFIG_TEGRA241_CMDQV */ + +struct arm_vsmmu { + struct iommufd_viommu core; + struct arm_smmu_device *smmu; + struct arm_smmu_domain *s2_parent; + u16 vmid; +}; + +#if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) +void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); +struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, + struct iommu_domain *parent, + struct iommufd_ctx *ictx, + unsigned int viommu_type); +#else +#define arm_smmu_hw_info NULL +#define arm_vsmmu_alloc NULL +#endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ + #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index fcd13d301fff..c8ec74f089f3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -509,7 +509,8 @@ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq) snprintf(name, 16, "vcmdq%u", vcmdq->idx); - q->llq.max_n_shift = VCMDQ_LOG2SIZE_MAX; + /* Queue size, capped to ensure natural alignment */ + q->llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, VCMDQ_LOG2SIZE_MAX); /* Use the common helper to init the VCMDQ, and then... */ ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0, @@ -800,7 +801,9 @@ out_fallback: return 0; } -struct dentry *cmdqv_debugfs_dir; +#ifdef CONFIG_IOMMU_DEBUGFS +static struct dentry *cmdqv_debugfs_dir; +#endif static struct arm_smmu_device * __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 8321962b3714..ade4684c14c9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1437,6 +1437,17 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) goto out_free; } else { smmu = arm_smmu_get_by_fwnode(fwspec->iommu_fwnode); + + /* + * Defer probe if the relevant SMMU instance hasn't finished + * probing yet. This is a fragile hack and we'd ideally + * avoid this race in the core code. Until that's ironed + * out, however, this is the most pragmatic option on the + * table. + */ + if (!smmu) + return ERR_PTR(dev_err_probe(dev, -EPROBE_DEFER, + "smmu dev has not bound yet\n")); } ret = -EINVAL; @@ -1558,21 +1569,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1656,7 +1652,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 88fd32a9323c..f2f538c70650 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -14,6 +14,7 @@ config INTEL_IOMMU depends on PCI_MSI && ACPI && X86 select IOMMU_API select IOMMU_IOVA + select IOMMU_IOPF select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE @@ -50,7 +51,6 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA - select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index c8beb0281559..d3bb0798092d 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o cache.o prq.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index eaf862e8dea1..9f424acf474e 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1060,7 +1060,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu->seq_id; goto error; } - sprintf(iommu->name, "dmar%d", iommu->seq_id); + snprintf(iommu->name, sizeof(iommu->name), "dmar%d", iommu->seq_id); err = map_iommu(iommu, drhd); if (err) { @@ -1895,19 +1895,6 @@ void dmar_msi_write(int irq, struct msi_msg *msg) raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -void dmar_msi_read(int irq, struct msi_msg *msg) -{ - struct intel_iommu *iommu = irq_get_handler_data(irq); - int reg = dmar_msi_reg(iommu, irq); - unsigned long flag; - - raw_spin_lock_irqsave(&iommu->register_lock, flag); - msg->data = readl(iommu->reg + reg + 4); - msg->address_lo = readl(iommu->reg + reg + 8); - msg->address_hi = readl(iommu->reg + reg + 12); - raw_spin_unlock_irqrestore(&iommu->register_lock, flag); -} - static int dmar_fault_do_one(struct intel_iommu *iommu, int type, u8 fault_reason, u32 pasid, u16 source_id, unsigned long long addr) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index e860bc9439a2..527f6f89d8a1 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -352,89 +352,6 @@ static bool iommu_paging_structure_coherency(struct intel_iommu *iommu) ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); } -static void domain_update_iommu_coherency(struct dmar_domain *domain) -{ - struct iommu_domain_info *info; - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - bool found = false; - unsigned long i; - - domain->iommu_coherency = true; - xa_for_each(&domain->iommu_array, i, info) { - found = true; - if (!iommu_paging_structure_coherency(info->iommu)) { - domain->iommu_coherency = false; - break; - } - } - if (found) - return; - - /* No hardware attached; use lowest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = false; - break; - } - } - rcu_read_unlock(); -} - -static int domain_update_iommu_superpage(struct dmar_domain *domain, - struct intel_iommu *skip) -{ - struct dmar_drhd_unit *drhd; - struct intel_iommu *iommu; - int mask = 0x3; - - if (!intel_iommu_superpage) - return 0; - - /* set iommu_superpage to the smallest common denominator */ - rcu_read_lock(); - for_each_active_iommu(iommu, drhd) { - if (iommu != skip) { - if (domain && domain->use_first_level) { - if (!cap_fl1gp_support(iommu->cap)) - mask = 0x1; - } else { - mask &= cap_super_page_val(iommu->cap); - } - - if (!mask) - break; - } - } - rcu_read_unlock(); - - return fls(mask); -} - -static int domain_update_device_node(struct dmar_domain *domain) -{ - struct device_domain_info *info; - int nid = NUMA_NO_NODE; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - /* - * There could possibly be multiple device numa nodes as devices - * within the same domain may sit behind different IOMMUs. There - * isn't perfect answer in such situation, so we select first - * come first served policy. - */ - nid = dev_to_node(info->dev); - if (nid != NUMA_NO_NODE) - break; - } - spin_unlock_irqrestore(&domain->lock, flags); - - return nid; -} - /* Return the super pagesize bitmap if supported. */ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) { @@ -452,34 +369,6 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) return bitmap; } -/* Some capabilities may be different across iommus */ -void domain_update_iommu_cap(struct dmar_domain *domain) -{ - domain_update_iommu_coherency(domain); - domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); - - /* - * If RHSA is missing, we should default to the device numa domain - * as fall back. - */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = domain_update_device_node(domain); - - /* - * First-level translation restricts the input-address to a - * canonical address (i.e., address bits 63:N have the same - * value as address bit [N-1], where N is 48-bits with 4-level - * paging and 57-bits with 5-level paging). Hence, skip bit - * [N-1]. - */ - if (domain->use_first_level) - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1); - else - domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); - - domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); -} - struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, u8 devfn, int alloc) { @@ -707,14 +596,15 @@ static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, while (1) { offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) { - pr_info("PTE not present at level %d\n", level); - break; - } pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val); - if (level == 1) + if (!dma_pte_present(pte)) { + pr_info("page table not present at level %d\n", level - 1); + break; + } + + if (level == 1 || dma_pte_superpage(pte)) break; parent = phys_to_virt(dma_pte_addr(pte)); @@ -737,11 +627,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr); /* root entry dump */ - rt_entry = &iommu->root_entry[bus]; - if (!rt_entry) { - pr_info("root table entry is not present\n"); + if (!iommu->root_entry) { + pr_info("root table is not present\n"); return; } + rt_entry = &iommu->root_entry[bus]; if (sm_supported(iommu)) pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n", @@ -752,7 +642,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* context entry dump */ ctx_entry = iommu_context_addr(iommu, bus, devfn, 0); if (!ctx_entry) { - pr_info("context table entry is not present\n"); + pr_info("context table is not present\n"); return; } @@ -761,17 +651,23 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* legacy mode does not require PASID entries */ if (!sm_supported(iommu)) { + if (!context_present(ctx_entry)) { + pr_info("legacy mode page table is not present\n"); + return; + } level = agaw_to_level(ctx_entry->hi & 7); pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); goto pgtable_walk; } - /* get the pointer to pasid directory entry */ - dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); - if (!dir) { - pr_info("pasid directory entry is not present\n"); + if (!context_present(ctx_entry)) { + pr_info("pasid directory table is not present\n"); return; } + + /* get the pointer to pasid directory entry */ + dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK); + /* For request-without-pasid, get the pasid from context entry */ if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID) pasid = IOMMU_NO_PASID; @@ -783,7 +679,7 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, /* get the pointer to the pasid table entry */ entries = get_pasid_table_from_pde(pde); if (!entries) { - pr_info("pasid table entry is not present\n"); + pr_info("pasid table is not present\n"); return; } index = pasid & PASID_PTE_MASK; @@ -791,6 +687,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, for (i = 0; i < ARRAY_SIZE(pte->val); i++) pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]); + if (!pasid_pte_is_present(pte)) { + pr_info("scalable mode page table is not present\n"); + return; + } + if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) { level = pte->val[2] & BIT_ULL(2) ? 5 : 4; pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK); @@ -1428,51 +1329,25 @@ static void free_dmar_iommu(struct intel_iommu *iommu) /* free context mapping */ free_context_table(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu)) { - if (ecap_prs(iommu->ecap)) - intel_svm_finish_prq(iommu); - } -#endif + if (ecap_prs(iommu->ecap)) + intel_iommu_finish_prq(iommu); } /* * Check and return whether first level is used by default for * DMA translation. */ -static bool first_level_by_default(unsigned int type) +static bool first_level_by_default(struct intel_iommu *iommu) { /* Only SL is available in legacy mode */ - if (!scalable_mode_support()) + if (!sm_supported(iommu)) return false; /* Only level (either FL or SL) is available, just use it */ - if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity()) - return intel_cap_flts_sanity(); - - /* Both levels are available, decide it based on domain type */ - return type != IOMMU_DOMAIN_UNMANAGED; -} + if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap)) + return ecap_flts(iommu->ecap); -static struct dmar_domain *alloc_domain(unsigned int type) -{ - struct dmar_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return NULL; - - domain->nid = NUMA_NO_NODE; - if (first_level_by_default(type)) - domain->use_first_level = true; - INIT_LIST_HEAD(&domain->devices); - INIT_LIST_HEAD(&domain->dev_pasids); - INIT_LIST_HEAD(&domain->cache_tags); - spin_lock_init(&domain->lock); - spin_lock_init(&domain->cache_lock); - xa_init(&domain->iommu_array); - - return domain; + return true; } int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -1514,7 +1389,6 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) ret = xa_err(curr) ? : -EBUSY; goto err_clear; } - domain_update_iommu_cap(domain); spin_unlock(&iommu->lock); return 0; @@ -1540,26 +1414,11 @@ void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) clear_bit(info->did, iommu->domain_ids); xa_erase(&domain->iommu_array, iommu->seq_id); domain->nid = NUMA_NO_NODE; - domain_update_iommu_cap(domain); kfree(info); } spin_unlock(&iommu->lock); } -static int guestwidth_to_adjustwidth(int gaw) -{ - int agaw; - int r = (gaw - 12) % 9; - - if (r == 0) - agaw = gaw; - else - agaw = gaw + 9 - r; - if (agaw > 64) - agaw = 64; - return agaw; -} - static void domain_exit(struct dmar_domain *domain) { if (domain->pgd) { @@ -1601,7 +1460,7 @@ static void copied_context_tear_down(struct intel_iommu *iommu, if (did_old < cap_ndoms(iommu->cap)) { iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did_old, 0, 0, @@ -1622,7 +1481,7 @@ static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, { if (cap_caching_mode(iommu->cap)) { iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, + PCI_DEVID(bus, devfn), DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); @@ -1641,7 +1500,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, int translation = CONTEXT_TT_MULTI_LEVEL; struct dma_pte *pgd = domain->pgd; struct context_entry *context; - int agaw, ret; + int ret; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1658,27 +1517,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - if (info && info->ats_supported) translation = CONTEXT_TT_DEV_IOTLB; else translation = CONTEXT_TT_MULTI_LEVEL; context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); + context_set_address_width(context, domain->agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); @@ -1905,26 +1752,52 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 intel_context_flush_present(info, context, did, true); } +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_first_level(iommu, dev, pgd, + pasid, did, flags); + return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did, + iommu_domain_did(old, iommu), + flags); +} + +static int domain_setup_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_second_level(iommu, domain, + dev, pasid); + return intel_pasid_replace_second_level(iommu, domain, dev, + iommu_domain_did(old, iommu), + pasid); +} + +static int domain_setup_passthrough(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_pass_through(iommu, dev, pasid); + return intel_pasid_replace_pass_through(iommu, dev, + iommu_domain_did(old, iommu), + pasid); +} + static int domain_setup_first_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, - u32 pasid) + u32 pasid, struct iommu_domain *old) { struct dma_pte *pgd = domain->pgd; - int agaw, level; - int flags = 0; - - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - return -ENOMEM; - } + int level, flags = 0; - level = agaw_to_level(agaw); + level = agaw_to_level(domain->agaw); if (level != 4 && level != 5) return -EINVAL; @@ -1934,15 +1807,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (domain->force_snooping) flags |= PASID_FLAG_PAGE_SNOOP; - return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, - domain_id_iommu(domain, iommu), - flags); -} - -static bool dev_is_real_dma_subdevice(struct device *dev) -{ - return dev && dev_is_pci(dev) && - pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); + return __domain_setup_first_level(iommu, dev, pasid, + domain_id_iommu(domain, iommu), + (pgd_t *)pgd, flags, old); } static int dmar_domain_attach_device(struct dmar_domain *domain, @@ -1968,9 +1835,11 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); else if (domain->use_first_level) - ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); else - ret = intel_pasid_setup_second_level(iommu, domain, dev, IOMMU_NO_PASID); + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); if (ret) goto out_block_translation; @@ -2354,19 +2223,18 @@ static int __init init_dmars(void) iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { + if (ecap_prs(iommu->ecap)) { /* * Call dmar_alloc_hwirq() with dmar_global_lock held, * could cause possible lock race condition. */ up_write(&dmar_global_lock); - ret = intel_svm_enable_prq(iommu); + ret = intel_iommu_enable_prq(iommu); down_write(&dmar_global_lock); if (ret) goto free_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto free_iommu; @@ -2746,20 +2614,13 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { - int sp, ret; struct intel_iommu *iommu = dmaru->iommu; + int ret; ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu); if (ret) goto out; - sp = domain_update_iommu_superpage(NULL, iommu) - 1; - if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { - pr_warn("%s: Doesn't support large page.\n", - iommu->name); - return -ENXIO; - } - /* * Disable translation if already enabled prior to OS handover. */ @@ -2786,13 +2647,12 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) intel_iommu_init_qi(iommu); iommu_flush_write_buffer(iommu); -#ifdef CONFIG_INTEL_IOMMU_SVM - if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) { - ret = intel_svm_enable_prq(iommu); + if (ecap_prs(iommu->ecap)) { + ret = intel_iommu_enable_prq(iommu); if (ret) goto disable_iommu; } -#endif + ret = dmar_set_interrupt(iommu); if (ret) goto disable_iommu; @@ -3288,7 +3148,7 @@ int __init intel_iommu_init(void) * the virtual and physical IOMMU page-tables. */ if (cap_caching_mode(iommu->cap) && - !first_level_by_default(IOMMU_DOMAIN_DMA)) { + !first_level_by_default(iommu)) { pr_info_once("IOMMU batching disallowed due to virtualization\n"); iommu_set_dma_strict(); } @@ -3381,27 +3241,6 @@ void device_block_translation(struct device *dev) info->domain = NULL; } -static int md_domain_init(struct dmar_domain *domain, int guest_width) -{ - int adjust_width; - - /* calculate AGAW */ - domain->gaw = guest_width; - adjust_width = guestwidth_to_adjustwidth(guest_width); - domain->agaw = width_to_agaw(adjust_width); - - domain->iommu_coherency = false; - domain->iommu_superpage = 0; - domain->max_addr = 0; - - /* always allocate the top pgd */ - domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); - if (!domain->pgd) - return -ENOMEM; - domain_flush_cache(domain, domain->pgd, PAGE_SIZE); - return 0; -} - static int blocking_domain_attach_dev(struct iommu_domain *domain, struct device *dev) { @@ -3488,39 +3327,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st return domain; } -static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) -{ - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; - - switch (type) { - case IOMMU_DOMAIN_DMA: - case IOMMU_DOMAIN_UNMANAGED: - dmar_domain = alloc_domain(type); - if (!dmar_domain) { - pr_err("Can't allocate dmar_domain\n"); - return NULL; - } - if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - pr_err("Domain initialization failed\n"); - domain_exit(dmar_domain); - return NULL; - } - - domain = &dmar_domain->domain; - domain->geometry.aperture_start = 0; - domain->geometry.aperture_end = - __DOMAIN_MAX_ADDR(dmar_domain->gaw); - domain->geometry.force_aperture = true; - - return domain; - default: - return NULL; - } - - return NULL; -} - static struct iommu_domain * intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -3532,6 +3338,7 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct intel_iommu *iommu = info->iommu; struct dmar_domain *dmar_domain; struct iommu_domain *domain; + bool first_stage; /* Must be NESTING domain */ if (parent) { @@ -3541,15 +3348,28 @@ intel_iommu_domain_alloc_user(struct device *dev, u32 flags, } if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING + | IOMMU_HWPT_FAULT_ID_VALID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); if (user_data || (dirty_tracking && !ssads_supported(iommu))) return ERR_PTR(-EOPNOTSUPP); - /* Do not use first stage for user domain translation. */ - dmar_domain = paging_domain_alloc(dev, false); + /* + * Always allocate the guest compatible page table unless + * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING + * is specified. + */ + if (nested_parent || dirty_tracking) { + if (!sm_supported(iommu) || !ecap_slts(iommu->ecap)) + return ERR_PTR(-EOPNOTSUPP); + first_stage = false; + } else { + first_stage = first_level_by_default(iommu); + } + + dmar_domain = paging_domain_alloc(dev, first_stage); if (IS_ERR(dmar_domain)) return ERR_CAST(dmar_domain); domain = &dmar_domain->domain; @@ -3583,42 +3403,41 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) domain_exit(dmar_domain); } -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; int addr_width; + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EPERM; + if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; if (domain->dirty_ops && !ssads_supported(iommu)) return -EINVAL; + if (dmar_domain->iommu_coherency != + iommu_paging_structure_coherency(iommu)) + return -EINVAL; + + if (dmar_domain->iommu_superpage != + iommu_superpage_capability(iommu, dmar_domain->use_first_level)) + return -EINVAL; + + if (dmar_domain->use_first_level && + (!sm_supported(iommu) || !ecap_flts(iommu->ecap))) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) + if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw) return -EINVAL; - dmar_domain->gaw = addr_width; - - /* - * Knock out extra levels of page tables if necessary - */ - while (iommu->agaw < dmar_domain->agaw) { - struct dma_pte *pte; - - pte = dmar_domain->pgd; - if (dma_pte_present(pte)) { - dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - iommu_free_page(pte); - } - dmar_domain->agaw--; - } if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) && context_copied(iommu, info->bus, info->devfn)) @@ -3634,7 +3453,7 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, device_block_translation(dev); - ret = prepare_domain_attach_device(domain, dev); + ret = paging_domain_compatible(domain, dev); if (ret) return ret; @@ -4252,8 +4071,8 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dev_pasid_info *curr, *dev_pasid = NULL; @@ -4261,10 +4080,12 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct dmar_domain *dmar_domain; unsigned long flags; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - intel_pasid_tear_down_entry(iommu, dev, pasid, false); + if (!domain) + return; + + /* Identity domain has no meta data for pasid. */ + if (domain->type == IOMMU_DOMAIN_IDENTITY) return; - } dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); @@ -4282,12 +4103,20 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, domain_detach_iommu(dmar_domain, iommu); intel_iommu_debugfs_remove_dev_pasid(dev_pasid); kfree(dev_pasid); - intel_pasid_tear_down_entry(iommu, dev, pasid, false); - intel_drain_pasid_prq(dev, pasid); } -static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + intel_pasid_tear_down_entry(info->iommu, dev, pasid, false); + domain_remove_dev_pasid(domain, dev, pasid); +} + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct dmar_domain *dmar_domain = to_dmar_domain(domain); @@ -4296,22 +4125,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, unsigned long flags; int ret; - if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) - return -EOPNOTSUPP; - - if (domain->dirty_ops) - return -EINVAL; - - if (context_copied(iommu, info->bus, info->devfn)) - return -EBUSY; - - ret = prepare_domain_attach_device(domain, dev); - if (ret) - return ret; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); if (!dev_pasid) - return -ENOMEM; + return ERR_PTR(-ENOMEM); ret = domain_attach_iommu(dmar_domain, iommu); if (ret) @@ -4321,31 +4137,67 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_detach_iommu; - if (dmar_domain->use_first_level) - ret = domain_setup_first_level(iommu, dmar_domain, - dev, pasid); - else - ret = intel_pasid_setup_second_level(iommu, dmar_domain, - dev, pasid); - if (ret) - goto out_unassign_tag; - dev_pasid->dev = dev; dev_pasid->pasid = pasid; spin_lock_irqsave(&dmar_domain->lock, flags); list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); spin_unlock_irqrestore(&dmar_domain->lock, flags); - if (domain->type & __IOMMU_DOMAIN_PAGING) - intel_iommu_debugfs_create_dev_pasid(dev_pasid); - - return 0; -out_unassign_tag: - cache_tag_unassign_domain(dmar_domain, dev, pasid); + return dev_pasid; out_detach_iommu: domain_detach_iommu(dmar_domain, iommu); out_free: kfree(dev_pasid); + return ERR_PTR(ret); +} + +static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING))) + return -EINVAL; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (domain->dirty_ops) + return -EINVAL; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + if (dmar_domain->use_first_level) + ret = domain_setup_first_level(iommu, dmar_domain, + dev, pasid, old); + else + ret = domain_setup_second_level(iommu, dmar_domain, + dev, pasid, old); + if (ret) + goto out_remove_dev_pasid; + + domain_remove_dev_pasid(old, dev, pasid); + + intel_iommu_debugfs_create_dev_pasid(dev_pasid); + + return 0; + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } @@ -4573,15 +4425,22 @@ static int identity_domain_attach_dev(struct iommu_domain *domain, struct device } static int identity_domain_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + int ret; if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; - return intel_pasid_setup_pass_through(iommu, dev, pasid); + ret = domain_setup_passthrough(iommu, dev, pasid, old); + if (ret) + return ret; + + domain_remove_dev_pasid(old, dev, pasid); + return 0; } static struct iommu_domain identity_domain = { @@ -4592,15 +4451,30 @@ static struct iommu_domain identity_domain = { }, }; +static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; + bool first_stage; + + first_stage = first_level_by_default(iommu); + dmar_domain = paging_domain_alloc(dev, first_stage); + if (IS_ERR(dmar_domain)) + return ERR_CAST(dmar_domain); + + return &dmar_domain->domain; +} + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, - .domain_alloc = intel_iommu_domain_alloc, .domain_alloc_user = intel_iommu_domain_alloc_user, .domain_alloc_sva = intel_svm_domain_alloc, + .domain_alloc_paging = intel_iommu_domain_alloc_paging, .probe_device = intel_iommu_probe_device, .release_device = intel_iommu_release_device, .get_resv_regions = intel_iommu_get_resv_regions, @@ -4611,9 +4485,7 @@ const struct iommu_ops intel_iommu_ops = { .def_domain_type = device_def_domain_type, .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, -#ifdef CONFIG_INTEL_IOMMU_SVM - .page_response = intel_svm_page_response, -#endif + .page_response = intel_iommu_page_response, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = intel_iommu_attach_device, .set_dev_pasid = intel_iommu_set_dev_pasid, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 1497f3112b12..2cca094c259d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -22,6 +22,7 @@ #include <linux/bitfield.h> #include <linux/xarray.h> #include <linux/perf_event.h> +#include <linux/pci.h> #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -653,8 +654,6 @@ struct dmar_domain { struct { /* parent page table which the user domain is nested on */ struct dmar_domain *s2_domain; - /* user page table pointer (in GPA) */ - unsigned long s1_pgtbl; /* page table attributes */ struct iommu_hwpt_vtd_s1 s1_cfg; /* link to parent domain siblings */ @@ -720,7 +719,7 @@ struct intel_iommu { int msagaw; /* max sagaw of this iommu */ unsigned int irq, pr_irq, perf_irq; u16 segment; /* PCI segment# */ - unsigned char name[13]; /* Device Name */ + unsigned char name[16]; /* Device Name */ #ifdef CONFIG_INTEL_IOMMU unsigned long *domain_ids; /* bitmap of domains */ @@ -730,12 +729,10 @@ struct intel_iommu { struct iommu_flush flush; #endif -#ifdef CONFIG_INTEL_IOMMU_SVM struct page_req_dsc *prq; unsigned char prq_name[16]; /* Name for PRQ interrupt */ unsigned long prq_seq_number; struct completion prq_complete; -#endif struct iopf_queue *iopf_queue; unsigned char iopfq_name[16]; /* Synchronization between fault report and iommu device release. */ @@ -810,6 +807,13 @@ static inline struct dmar_domain *to_dmar_domain(struct iommu_domain *dom) return container_of(dom, struct dmar_domain, domain); } +/* + * Domain ID reserved for pasid entries programmed for first-level + * only and pass-through transfer modes. + */ +#define FLPT_DEFAULT_DID 1 +#define NUM_RESERVED_DID 2 + /* Retrieve the domain ID which has allocated to the domain */ static inline u16 domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) @@ -820,6 +824,21 @@ domain_id_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return info->did; } +static inline u16 +iommu_domain_did(struct iommu_domain *domain, struct intel_iommu *iommu) +{ + if (domain->type == IOMMU_DOMAIN_SVA || + domain->type == IOMMU_DOMAIN_IDENTITY) + return FLPT_DEFAULT_DID; + return domain_id_iommu(to_dmar_domain(domain), iommu); +} + +static inline bool dev_is_real_dma_subdevice(struct device *dev) +{ + return dev && dev_is_pci(dev) && + pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); +} + /* * 0: readable * 1: writable @@ -1230,9 +1249,18 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); -int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev); -void domain_update_iommu_cap(struct dmar_domain *domain); +int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); + +struct dev_pasid_info * +domain_add_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); +void domain_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid); + +int __domain_setup_first_level(struct intel_iommu *iommu, + struct device *dev, ioasid_t pasid, + u16 did, pgd_t *pgd, int flags, + struct iommu_domain *old); int dmar_ir_support(void); @@ -1278,18 +1306,18 @@ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, u16 did, bool affect_domains); +int intel_iommu_enable_prq(struct intel_iommu *iommu); +int intel_iommu_finish_prq(struct intel_iommu *iommu); +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid); + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); -int intel_svm_enable_prq(struct intel_iommu *iommu); -int intel_svm_finish_prq(struct intel_iommu *iommu); -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm); -void intel_drain_pasid_prq(struct device *dev, u32 pasid); #else static inline void intel_svm_check(struct intel_iommu *iommu) {} -static inline void intel_drain_pasid_prq(struct device *dev, u32 pasid) {} static inline struct iommu_domain *intel_svm_domain_alloc(struct device *dev, struct mm_struct *mm) { diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 7a6d188e3bea..466c1412dd45 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -312,7 +312,7 @@ static int set_ioapic_sid(struct irte *irte, int apic) for (i = 0; i < MAX_IO_APICS; i++) { if (ir_ioapic[i].iommu && ir_ioapic[i].id == apic) { - sid = (ir_ioapic[i].bus << 8) | ir_ioapic[i].devfn; + sid = PCI_DEVID(ir_ioapic[i].bus, ir_ioapic[i].devfn); break; } } @@ -337,7 +337,7 @@ static int set_hpet_sid(struct irte *irte, u8 id) for (i = 0; i < MAX_HPET_TBS; i++) { if (ir_hpet[i].iommu && ir_hpet[i].id == id) { - sid = (ir_hpet[i].bus << 8) | ir_hpet[i].devfn; + sid = PCI_DEVID(ir_hpet[i].bus, ir_hpet[i].devfn); break; } } diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 433c58944401..42c4533a6ea2 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, * The s2_domain will be used in nested translation, hence needs * to ensure the s2_domain is compatible with this IOMMU. */ - ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); if (ret) { dev_err_ratelimited(dev, "s2 domain is not compatible\n"); return ret; @@ -130,8 +130,58 @@ out: return ret; } +static int domain_setup_nested(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + if (!old) + return intel_pasid_setup_nested(iommu, dev, pasid, domain); + return intel_pasid_replace_nested(iommu, dev, pasid, + iommu_domain_did(old, iommu), + domain); +} + +static int intel_nested_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + struct dev_pasid_info *dev_pasid; + int ret; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + if (context_copied(iommu, info->bus, info->devfn)) + return -EBUSY; + + ret = paging_domain_compatible(&dmar_domain->s2_domain->domain, dev); + if (ret) + return ret; + + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); + + ret = domain_setup_nested(iommu, dmar_domain, dev, pasid, old); + if (ret) + goto out_remove_dev_pasid; + + domain_remove_dev_pasid(old, dev, pasid); + + return 0; + +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); + return ret; +} + static const struct iommu_domain_ops intel_nested_domain_ops = { .attach_dev = intel_nested_attach_dev, + .set_dev_pasid = intel_nested_set_dev_pasid, .free = intel_nested_domain_free, .cache_invalidate_user = intel_nested_cache_invalidate_user, }; @@ -162,7 +212,6 @@ struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, domain->use_first_level = true; domain->s2_domain = s2_domain; - domain->s1_pgtbl = vtd.pgtbl_addr; domain->s1_cfg = vtd; domain->domain.ops = &intel_nested_domain_ops; domain->domain.type = IOMMU_DOMAIN_NESTED; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 2e5fa0a23299..0f2a926d3bd5 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -220,7 +220,7 @@ devtlb_invalidation_with_pasid(struct intel_iommu *iommu, if (pci_dev_is_disconnected(to_pci_dev(dev))) return; - sid = info->bus << 8 | info->devfn; + sid = PCI_DEVID(info->bus, info->devfn); qdep = info->ats_qdep; pfsid = info->pfsid; @@ -265,6 +265,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); devtlb_invalidation_with_pasid(iommu, dev, pasid); + intel_iommu_drain_pasid_prq(dev, pasid); } /* @@ -287,9 +288,68 @@ static void pasid_flush_caches(struct intel_iommu *iommu, } /* + * This function is supposed to be used after caller updates the fields + * except for the SSADE and P bit of a pasid table entry. It does the + * below: + * - Flush cacheline if needed + * - Flush the caches per Table 28 ”Guidance to Software for Invalidations“ + * of VT-d spec 5.0. + */ +static void intel_pasid_flush_present(struct intel_iommu *iommu, + struct device *dev, + u32 pasid, u16 did, + struct pasid_entry *pte) +{ + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * VT-d spec 5.0 table28 states guides for cache invalidation: + * + * - PASID-selective-within-Domain PASID-cache invalidation + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + devtlb_invalidation_with_pasid(iommu, dev, pasid); +} + +/* * Set up the scalable mode pasid table entry for first only * translation type. */ +static void pasid_pte_config_first_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + pgd_t *pgd, u16 did, int flags) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + /* Setup the first level page table pointer: */ + pasid_set_flptr(pte, (u64)__pa(pgd)); + + if (flags & PASID_FLAG_FL5LP) + pasid_set_flpm(pte, 1); + + if (flags & PASID_FLAG_PAGE_SNOOP) + pasid_set_pgsnp(pte); + + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + + /* Setup Present and PASID Granular Transfer Type: */ + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); + pasid_set_present(pte); +} + int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, pgd_t *pgd, u32 pasid, u16 did, int flags) @@ -320,53 +380,82 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_first_level(iommu, pte, pgd, did, flags); - /* Setup the first level page table pointer: */ - pasid_set_flptr(pte, (u64)__pa(pgd)); + spin_unlock(&iommu->lock); - if (flags & PASID_FLAG_FL5LP) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - if (flags & PASID_FLAG_PAGE_SNOOP) - pasid_set_pgsnp(pte); + return 0; +} - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags) +{ + struct pasid_entry *pte, new_pte; - /* Setup Present and PASID Granular Transfer Type: */ - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); - pasid_set_present(pte); + if (!ecap_flts(iommu->ecap)) { + pr_err("No first level translation support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { + pr_err("No 5-level paging support for first-level on %s\n", + iommu->name); + return -EINVAL; + } + + pasid_pte_config_first_level(iommu, &new_pte, pgd, did, flags); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } /* - * Skip top levels of page tables for iommu which has less agaw - * than default. Unnecessary for PT mode. + * Set up the scalable mode pasid entry for second only translation type. */ -static int iommu_skip_agaw(struct dmar_domain *domain, - struct intel_iommu *iommu, - struct dma_pte **pgd) +static void pasid_pte_config_second_level(struct intel_iommu *iommu, + struct pasid_entry *pte, + u64 pgd_val, int agaw, u16 did, + bool dirty_tracking) { - int agaw; + lockdep_assert_held(&iommu->lock); - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - *pgd = phys_to_virt(dma_pte_addr(*pgd)); - if (!dma_pte_present(*pgd)) - return -EINVAL; - } + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_slptr(pte, pgd_val); + pasid_set_address_width(pte, agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (dirty_tracking) + pasid_set_ssade(pte); - return agaw; + pasid_set_present(pte); } -/* - * Set up the scalable mode pasid entry for second only translation type. - */ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid) @@ -374,7 +463,6 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct pasid_entry *pte; struct dma_pte *pgd; u64 pgd_val; - int agaw; u16 did; /* @@ -388,15 +476,58 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } pgd = domain->pgd; - agaw = iommu_skip_agaw(domain, iommu, &pgd); - if (agaw < 0) { - dev_err(dev, "Invalid domain page table\n"); + pgd_val = virt_to_phys(pgd); + did = domain_id_iommu(domain, iommu); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, + did, domain->dirty_tracking); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} + +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + struct dma_pte *pgd; + u64 pgd_val; + u16 did; + + /* + * If hardware advertises no support for second level + * translation, return directly. + */ + if (!ecap_slts(iommu->ecap)) { + pr_err("No second level translation support on %s\n", + iommu->name); return -EINVAL; } + pgd = domain->pgd; pgd_val = virt_to_phys(pgd); did = domain_id_iommu(domain, iommu); + pasid_pte_config_second_level(iommu, &new_pte, pgd_val, + domain->agaw, did, + domain->dirty_tracking); + spin_lock(&iommu->lock); pte = intel_pasid_get_entry(dev, pasid); if (!pte) { @@ -404,25 +535,18 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, return -ENODEV; } - if (pasid_pte_is_present(pte)) { + if (!pasid_pte_is_present(pte)) { spin_unlock(&iommu->lock); - return -EBUSY; + return -EINVAL; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_slptr(pte, pgd_val); - pasid_set_address_width(pte, agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (domain->dirty_tracking) - pasid_set_ssade(pte); + WARN_ON(old_did != pasid_get_domain_id(pte)); - pasid_set_present(pte); + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } @@ -499,6 +623,20 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, /* * Set up the scalable mode pasid entry for passthrough translation type. */ +static void pasid_pte_config_pass_through(struct intel_iommu *iommu, + struct pasid_entry *pte, u16 did) +{ + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, iommu->agaw); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); + pasid_set_fault_enable(pte); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_present(pte); +} + int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid) { @@ -517,13 +655,7 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return -EBUSY; } - pasid_clear_entry(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, iommu->agaw); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_PT); - pasid_set_fault_enable(pte); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - pasid_set_present(pte); + pasid_pte_config_pass_through(iommu, pte, did); spin_unlock(&iommu->lock); pasid_flush_caches(iommu, pte, pasid, did); @@ -531,6 +663,38 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, return 0; } +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid) +{ + struct pasid_entry *pte, new_pte; + u16 did = FLPT_DEFAULT_DID; + + pasid_pte_config_pass_through(iommu, &new_pte, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; + spin_unlock(&iommu->lock); + + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); + + return 0; +} + /* * Set the page snoop control for a pasid entry which has been set up. */ @@ -551,24 +715,47 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, did = pasid_get_domain_id(pte); spin_unlock(&iommu->lock); - if (!ecap_coherent(iommu->ecap)) - clflush_cache_range(pte, sizeof(*pte)); + intel_pasid_flush_present(iommu, dev, pasid, did, pte); +} - /* - * VT-d spec 3.4 table23 states guides for cache invalidation: - * - * - PASID-selective-within-Domain PASID-cache invalidation - * - PASID-selective PASID-based IOTLB invalidation - * - If (pasid is RID_PASID) - * - Global Device-TLB invalidation to affected functions - * Else - * - PASID-based Device-TLB invalidation (with S=1 and - * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions - */ - pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); +static void pasid_pte_config_nestd(struct intel_iommu *iommu, + struct pasid_entry *pte, + struct iommu_hwpt_vtd_s1 *s1_cfg, + struct dmar_domain *s2_domain, + u16 did) +{ + struct dma_pte *pgd = s2_domain->pgd; - devtlb_invalidation_with_pasid(iommu, dev, pasid); + lockdep_assert_held(&iommu->lock); + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, s1_cfg->pgtbl_addr); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (s2_domain->dirty_tracking) + pasid_set_ssade(pte); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); } /** @@ -586,10 +773,8 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain) { struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; - pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; struct dmar_domain *s2_domain = domain->s2_domain; u16 did = domain_id_iommu(domain, iommu); - struct dma_pte *pgd = s2_domain->pgd; struct pasid_entry *pte; /* Address width should match the address width supported by hardware */ @@ -632,37 +817,73 @@ int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, return -EBUSY; } - pasid_clear_entry(pte); + pasid_pte_config_nestd(iommu, pte, s1_cfg, s2_domain, did); + spin_unlock(&iommu->lock); - if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) - pasid_set_flpm(pte, 1); + pasid_flush_caches(iommu, pte, pasid, did); - pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + return 0; +} - if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { - pasid_set_sre(pte); - if (s1_cfg->flags & IOMMU_VTD_S1_WPE) - pasid_set_wpe(pte); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct pasid_entry *pte, new_pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; } - if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) - pasid_set_eafe(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } - if (s2_domain->force_snooping) - pasid_set_pgsnp(pte); + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } - pasid_set_slptr(pte, virt_to_phys(pgd)); - pasid_set_fault_enable(pte); - pasid_set_domain_id(pte, did); - pasid_set_address_width(pte, s2_domain->agaw); - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); - if (s2_domain->dirty_tracking) - pasid_set_ssade(pte); - pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); - pasid_set_present(pte); + pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + + if (!pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EINVAL; + } + + WARN_ON(old_did != pasid_get_domain_id(pte)); + + *pte = new_pte; spin_unlock(&iommu->lock); - pasid_flush_caches(iommu, pte, pasid, did); + intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); + intel_iommu_drain_pasid_prq(dev, pasid); return 0; } diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index dde6d3ba5ae0..082f4fe20216 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -22,13 +22,6 @@ #define is_pasid_enabled(entry) (((entry)->lo >> 3) & 0x1) #define get_pasid_dir_size(entry) (1 << ((((entry)->lo >> 9) & 0x7) + 7)) -/* - * Domain ID reserved for pasid entries programmed for first-level - * only and pass-through transfer modes. - */ -#define FLPT_DEFAULT_DID 1 -#define NUM_RESERVED_DID 2 - #define PASID_FLAG_NESTED BIT(1) #define PASID_FLAG_PAGE_SNOOP BIT(2) @@ -303,6 +296,21 @@ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct device *dev, u32 pasid); int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, u32 pasid, struct dmar_domain *domain); +int intel_pasid_replace_first_level(struct intel_iommu *iommu, + struct device *dev, pgd_t *pgd, + u32 pasid, u16 did, u16 old_did, + int flags); +int intel_pasid_replace_second_level(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_pass_through(struct intel_iommu *iommu, + struct device *dev, u16 old_did, + u32 pasid); +int intel_pasid_replace_nested(struct intel_iommu *iommu, + struct device *dev, u32 pasid, + u16 old_did, struct dmar_domain *domain); + void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); diff --git a/drivers/iommu/intel/prq.c b/drivers/iommu/intel/prq.c new file mode 100644 index 000000000000..c2d792db52c3 --- /dev/null +++ b/drivers/iommu/intel/prq.c @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2015 Intel Corporation + * + * Originally split from drivers/iommu/intel/svm.c + */ + +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" +#include "trace.h" + +/* Page request queue descriptor */ +struct page_req_dsc { + union { + struct { + u64 type:8; + u64 pasid_present:1; + u64 rsvd:7; + u64 rid:16; + u64 pasid:20; + u64 exe_req:1; + u64 pm_req:1; + u64 rsvd2:10; + }; + u64 qw_0; + }; + union { + struct { + u64 rd_req:1; + u64 wr_req:1; + u64 lpig:1; + u64 prg_index:9; + u64 addr:52; + }; + u64 qw_1; + }; + u64 qw_2; + u64 qw_3; +}; + +/** + * intel_iommu_drain_pasid_prq - Drain page requests and responses for a pasid + * @dev: target device + * @pasid: pasid for draining + * + * Drain all pending page requests and responses related to @pasid in both + * software and hardware. This is supposed to be called after the device + * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB + * and DevTLB have been invalidated. + * + * It waits until all pending page requests for @pasid in the page fault + * queue are completed by the prq handling thread. Then follow the steps + * described in VT-d spec CH7.10 to drain all page requests and page + * responses pending in the hardware. + */ +void intel_iommu_drain_pasid_prq(struct device *dev, u32 pasid) +{ + struct device_domain_info *info; + struct dmar_domain *domain; + struct intel_iommu *iommu; + struct qi_desc desc[3]; + int head, tail; + u16 sid, did; + + info = dev_iommu_priv_get(dev); + if (!info->pri_enabled) + return; + + iommu = info->iommu; + domain = info->domain; + sid = PCI_DEVID(info->bus, info->devfn); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; + + /* + * Check and wait until all pending page requests in the queue are + * handled by the prq handling thread. + */ +prq_retry: + reinit_completion(&iommu->prq_complete); + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + while (head != tail) { + struct page_req_dsc *req; + + req = &iommu->prq[head / sizeof(*req)]; + if (!req->pasid_present || req->pasid != pasid) { + head = (head + sizeof(*req)) & PRQ_RING_MASK; + continue; + } + + wait_for_completion(&iommu->prq_complete); + goto prq_retry; + } + + iopf_queue_flush_dev(dev); + + /* + * Perform steps described in VT-d spec CH7.10 to drain page + * requests and responses in hardware. + */ + memset(desc, 0, sizeof(desc)); + desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | + QI_IWD_FENCE | + QI_IWD_TYPE; + if (pasid == IOMMU_NO_PASID) { + qi_desc_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH, &desc[1]); + qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, &desc[2]); + } else { + qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); + qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, + 0, MAX_AGAW_PFN_WIDTH, &desc[2]); + } +qi_retry: + reinit_completion(&iommu->prq_complete); + qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + wait_for_completion(&iommu->prq_complete); + goto qi_retry; + } +} + +static bool is_canonical_address(u64 addr) +{ + int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); + long saddr = (long)addr; + + return (((saddr << shift) >> shift) == saddr); +} + +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc = { }; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + if (!req->lpig) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + + qi_submit_sync(iommu, &desc, 1, 0); +} + +static int prq_to_iommu_prot(struct page_req_dsc *req) +{ + int prot = 0; + + if (req->rd_req) + prot |= IOMMU_FAULT_PERM_READ; + if (req->wr_req) + prot |= IOMMU_FAULT_PERM_WRITE; + if (req->exe_req) + prot |= IOMMU_FAULT_PERM_EXEC; + if (req->pm_req) + prot |= IOMMU_FAULT_PERM_PRIV; + + return prot; +} + +static void intel_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) +{ + struct iopf_fault event = { }; + + /* Fill in event data for device specific processing */ + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; + event.fault.prm.pasid = desc->pasid; + event.fault.prm.grpid = desc->prg_index; + event.fault.prm.perm = prq_to_iommu_prot(desc); + + if (desc->lpig) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (desc->pasid_present) { + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + } + + iommu_report_device_fault(dev, &event); +} + +static irqreturn_t prq_event_thread(int irq, void *d) +{ + struct intel_iommu *iommu = d; + struct page_req_dsc *req; + int head, tail, handled; + struct device *dev; + u64 address; + + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ + writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); + + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); + while (head != tail) { + req = &iommu->prq[head / sizeof(*req)]; + address = (u64)req->addr << VTD_PAGE_SHIFT; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); +bad_req: + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; + } + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; + } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + + /* Drop Stop Marker message. No need for a response. */ + if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) + goto prq_advance; + + /* + * If prq is to be handled outside iommu driver via receiver of + * the fault notifiers, we skip the page response here. + */ + mutex_lock(&iommu->iopf_lock); + dev = device_rbtree_find(iommu, req->rid); + if (!dev) { + mutex_unlock(&iommu->iopf_lock); + goto bad_req; + } + + intel_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->qw_2, req->qw_3, + iommu->prq_seq_number++); + mutex_unlock(&iommu->iopf_lock); +prq_advance: + head = (head + sizeof(*req)) & PRQ_RING_MASK; + } + + dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); + + /* + * Clear the page request overflow bit and wake up all threads that + * are waiting for the completion of this handling. + */ + if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { + pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", + iommu->name); + head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; + if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); + writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); + pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", + iommu->name); + } + } + + if (!completion_done(&iommu->prq_complete)) + complete(&iommu->prq_complete); + + return IRQ_RETVAL(handled); +} + +int intel_iommu_enable_prq(struct intel_iommu *iommu) +{ + struct iopf_queue *iopfq; + int irq, ret; + + iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); + if (!iommu->prq) { + pr_warn("IOMMU: %s: Failed to allocate page request queue\n", + iommu->name); + return -ENOMEM; + } + + irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); + if (irq <= 0) { + pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", + iommu->name); + ret = -EINVAL; + goto free_prq; + } + iommu->pr_irq = irq; + + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); + + ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, + iommu->prq_name, iommu); + if (ret) { + pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", + iommu->name); + goto free_iopfq; + } + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); + + init_completion(&iommu->prq_complete); + + return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return ret; +} + +int intel_iommu_finish_prq(struct intel_iommu *iommu) +{ + dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); + dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); + + if (iommu->pr_irq) { + free_irq(iommu->pr_irq, iommu); + dmar_free_hwirq(iommu->pr_irq); + iommu->pr_irq = 0; + } + + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + + iommu_free_pages(iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return 0; +} + +void intel_iommu_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + u8 bus = info->bus, devfn = info->devfn; + struct iommu_fault_page_request *prm; + struct qi_desc desc; + bool pasid_present; + bool last_page; + u16 sid; + + prm = &evt->fault.prm; + sid = PCI_DEVID(bus, devfn); + pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + + desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | + QI_PGRP_PASID_P(pasid_present) | + QI_PGRP_RESP_CODE(msg->code) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); + desc.qw2 = 0; + desc.qw3 = 0; + + qi_submit_sync(iommu, &desc, 1, 0); +} diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 078d1e32a24e..f5569347591f 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -25,92 +25,6 @@ #include "../iommu-pages.h" #include "trace.h" -static irqreturn_t prq_event_thread(int irq, void *d); - -int intel_svm_enable_prq(struct intel_iommu *iommu) -{ - struct iopf_queue *iopfq; - int irq, ret; - - iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); - if (!iommu->prq) { - pr_warn("IOMMU: %s: Failed to allocate page request queue\n", - iommu->name); - return -ENOMEM; - } - - irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); - if (irq <= 0) { - pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", - iommu->name); - ret = -EINVAL; - goto free_prq; - } - iommu->pr_irq = irq; - - snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), - "dmar%d-iopfq", iommu->seq_id); - iopfq = iopf_queue_alloc(iommu->iopfq_name); - if (!iopfq) { - pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); - ret = -ENOMEM; - goto free_hwirq; - } - iommu->iopf_queue = iopfq; - - snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); - - ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, - iommu->prq_name, iommu); - if (ret) { - pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", - iommu->name); - goto free_iopfq; - } - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); - - init_completion(&iommu->prq_complete); - - return 0; - -free_iopfq: - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; -free_hwirq: - dmar_free_hwirq(irq); - iommu->pr_irq = 0; -free_prq: - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return ret; -} - -int intel_svm_finish_prq(struct intel_iommu *iommu) -{ - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); - - if (iommu->pr_irq) { - free_irq(iommu->pr_irq, iommu); - dmar_free_hwirq(iommu->pr_irq); - iommu->pr_irq = 0; - } - - if (iommu->iopf_queue) { - iopf_queue_free(iommu->iopf_queue); - iommu->iopf_queue = NULL; - } - - iommu_free_pages(iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - - return 0; -} - void intel_svm_check(struct intel_iommu *iommu) { if (!pasid_supported(iommu)) @@ -197,360 +111,37 @@ static const struct mmu_notifier_ops intel_mmuops = { }; static int intel_svm_set_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu = info->iommu; struct mm_struct *mm = domain->mm; struct dev_pasid_info *dev_pasid; unsigned long sflags; - unsigned long flags; int ret = 0; - dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL); - if (!dev_pasid) - return -ENOMEM; - - dev_pasid->dev = dev; - dev_pasid->pasid = pasid; - - ret = cache_tag_assign_domain(to_dmar_domain(domain), dev, pasid); - if (ret) - goto free_dev_pasid; + dev_pasid = domain_add_dev_pasid(domain, dev, pasid); + if (IS_ERR(dev_pasid)) + return PTR_ERR(dev_pasid); /* Setup the pasid table: */ sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; - ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, pasid, - FLPT_DEFAULT_DID, sflags); + ret = __domain_setup_first_level(iommu, dev, pasid, + FLPT_DEFAULT_DID, mm->pgd, + sflags, old); if (ret) - goto unassign_tag; + goto out_remove_dev_pasid; - spin_lock_irqsave(&dmar_domain->lock, flags); - list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids); - spin_unlock_irqrestore(&dmar_domain->lock, flags); + domain_remove_dev_pasid(old, dev, pasid); return 0; -unassign_tag: - cache_tag_unassign_domain(to_dmar_domain(domain), dev, pasid); -free_dev_pasid: - kfree(dev_pasid); - +out_remove_dev_pasid: + domain_remove_dev_pasid(domain, dev, pasid); return ret; } -/* Page request queue descriptor */ -struct page_req_dsc { - union { - struct { - u64 type:8; - u64 pasid_present:1; - u64 rsvd:7; - u64 rid:16; - u64 pasid:20; - u64 exe_req:1; - u64 pm_req:1; - u64 rsvd2:10; - }; - u64 qw_0; - }; - union { - struct { - u64 rd_req:1; - u64 wr_req:1; - u64 lpig:1; - u64 prg_index:9; - u64 addr:52; - }; - u64 qw_1; - }; - u64 qw_2; - u64 qw_3; -}; - -static bool is_canonical_address(u64 addr) -{ - int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); - long saddr = (long) addr; - - return (((saddr << shift) >> shift) == saddr); -} - -/** - * intel_drain_pasid_prq - Drain page requests and responses for a pasid - * @dev: target device - * @pasid: pasid for draining - * - * Drain all pending page requests and responses related to @pasid in both - * software and hardware. This is supposed to be called after the device - * driver has stopped DMA, the pasid entry has been cleared, and both IOTLB - * and DevTLB have been invalidated. - * - * It waits until all pending page requests for @pasid in the page fault - * queue are completed by the prq handling thread. Then follow the steps - * described in VT-d spec CH7.10 to drain all page requests and page - * responses pending in the hardware. - */ -void intel_drain_pasid_prq(struct device *dev, u32 pasid) -{ - struct device_domain_info *info; - struct dmar_domain *domain; - struct intel_iommu *iommu; - struct qi_desc desc[3]; - struct pci_dev *pdev; - int head, tail; - u16 sid, did; - int qdep; - - info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev))) - return; - - if (!info->pri_enabled) - return; - - iommu = info->iommu; - domain = info->domain; - pdev = to_pci_dev(dev); - sid = PCI_DEVID(info->bus, info->devfn); - did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; - qdep = pci_ats_queue_depth(pdev); - - /* - * Check and wait until all pending page requests in the queue are - * handled by the prq handling thread. - */ -prq_retry: - reinit_completion(&iommu->prq_complete); - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - while (head != tail) { - struct page_req_dsc *req; - - req = &iommu->prq[head / sizeof(*req)]; - if (!req->pasid_present || req->pasid != pasid) { - head = (head + sizeof(*req)) & PRQ_RING_MASK; - continue; - } - - wait_for_completion(&iommu->prq_complete); - goto prq_retry; - } - - iopf_queue_flush_dev(dev); - - /* - * Perform steps described in VT-d spec CH7.10 to drain page - * requests and responses in hardware. - */ - memset(desc, 0, sizeof(desc)); - desc[0].qw0 = QI_IWD_STATUS_DATA(QI_DONE) | - QI_IWD_FENCE | - QI_IWD_TYPE; - desc[1].qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc[2].qw0 = QI_DEV_EIOTLB_PASID(pasid) | - QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | - QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(info->pfsid); -qi_retry: - reinit_completion(&iommu->prq_complete); - qi_submit_sync(iommu, desc, 3, QI_OPT_WAIT_DRAIN); - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - wait_for_completion(&iommu->prq_complete); - goto qi_retry; - } -} - -static int prq_to_iommu_prot(struct page_req_dsc *req) -{ - int prot = 0; - - if (req->rd_req) - prot |= IOMMU_FAULT_PERM_READ; - if (req->wr_req) - prot |= IOMMU_FAULT_PERM_WRITE; - if (req->exe_req) - prot |= IOMMU_FAULT_PERM_EXEC; - if (req->pm_req) - prot |= IOMMU_FAULT_PERM_PRIV; - - return prot; -} - -static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) -{ - struct iopf_fault event = { }; - - /* Fill in event data for device specific processing */ - event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; - event.fault.prm.pasid = desc->pasid; - event.fault.prm.grpid = desc->prg_index; - event.fault.prm.perm = prq_to_iommu_prot(desc); - - if (desc->lpig) - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (desc->pasid_present) { - event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - } - - iommu_report_device_fault(dev, &event); -} - -static void handle_bad_prq_event(struct intel_iommu *iommu, - struct page_req_dsc *req, int result) -{ - struct qi_desc desc = { }; - - pr_err("%s: Invalid page request: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - - if (!req->lpig) - return; - - desc.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - - qi_submit_sync(iommu, &desc, 1, 0); -} - -static irqreturn_t prq_event_thread(int irq, void *d) -{ - struct intel_iommu *iommu = d; - struct page_req_dsc *req; - int head, tail, handled; - struct device *dev; - u64 address; - - /* - * Clear PPR bit before reading head/tail registers, to ensure that - * we get a new interrupt if needed. - */ - writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); - - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - handled = (head != tail); - while (head != tail) { - req = &iommu->prq[head / sizeof(*req)]; - address = (u64)req->addr << VTD_PAGE_SHIFT; - - if (unlikely(!req->pasid_present)) { - pr_err("IOMMU: %s: Page request without PASID\n", - iommu->name); -bad_req: - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - goto prq_advance; - } - - if (unlikely(!is_canonical_address(address))) { - pr_err("IOMMU: %s: Address is not canonical\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { - pr_err("IOMMU: %s: Page request in Privilege Mode\n", - iommu->name); - goto bad_req; - } - - if (unlikely(req->exe_req && req->rd_req)) { - pr_err("IOMMU: %s: Execution request not supported\n", - iommu->name); - goto bad_req; - } - - /* Drop Stop Marker message. No need for a response. */ - if (unlikely(req->lpig && !req->rd_req && !req->wr_req)) - goto prq_advance; - - /* - * If prq is to be handled outside iommu driver via receiver of - * the fault notifiers, we skip the page response here. - */ - mutex_lock(&iommu->iopf_lock); - dev = device_rbtree_find(iommu, req->rid); - if (!dev) { - mutex_unlock(&iommu->iopf_lock); - goto bad_req; - } - - intel_svm_prq_report(iommu, dev, req); - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->qw_2, req->qw_3, - iommu->prq_seq_number++); - mutex_unlock(&iommu->iopf_lock); -prq_advance: - head = (head + sizeof(*req)) & PRQ_RING_MASK; - } - - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); - - /* - * Clear the page request overflow bit and wake up all threads that - * are waiting for the completion of this handling. - */ - if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { - pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", - iommu->name); - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; - if (head == tail) { - iopf_queue_discard_partial(iommu->iopf_queue); - writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); - pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", - iommu->name); - } - } - - if (!completion_done(&iommu->prq_complete)) - complete(&iommu->prq_complete); - - return IRQ_RETVAL(handled); -} - -void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg) -{ - struct device_domain_info *info = dev_iommu_priv_get(dev); - struct intel_iommu *iommu = info->iommu; - u8 bus = info->bus, devfn = info->devfn; - struct iommu_fault_page_request *prm; - struct qi_desc desc; - bool pasid_present; - bool last_page; - u16 sid; - - prm = &evt->fault.prm; - sid = PCI_DEVID(bus, devfn); - pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - - desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) | - QI_PGRP_PASID_P(pasid_present) | - QI_PGRP_RESP_CODE(msg->code) | - QI_PGRP_RESP_TYPE; - desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); - desc.qw2 = 0; - desc.qw3 = 0; - - qi_submit_sync(iommu, &desc, 1, 0); -} - static void intel_svm_domain_free(struct iommu_domain *domain) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 06ffc683b28f..523355e91a2c 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -166,7 +166,6 @@ struct arm_v7s_io_pgtable { arm_v7s_iopte *pgd; struct kmem_cache *l2_tables; - spinlock_t split_lock; }; static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl); @@ -363,25 +362,6 @@ static arm_v7s_iopte arm_v7s_prot_to_pte(int prot, int lvl, return pte; } -static int arm_v7s_pte_to_prot(arm_v7s_iopte pte, int lvl) -{ - int prot = IOMMU_READ; - arm_v7s_iopte attr = pte >> ARM_V7S_ATTR_SHIFT(lvl); - - if (!(attr & ARM_V7S_PTE_AP_RDONLY)) - prot |= IOMMU_WRITE; - if (!(attr & ARM_V7S_PTE_AP_UNPRIV)) - prot |= IOMMU_PRIV; - if ((attr & (ARM_V7S_TEX_MASK << ARM_V7S_TEX_SHIFT)) == 0) - prot |= IOMMU_MMIO; - else if (pte & ARM_V7S_ATTR_C) - prot |= IOMMU_CACHE; - if (pte & ARM_V7S_ATTR_XN(lvl)) - prot |= IOMMU_NOEXEC; - - return prot; -} - static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1) { @@ -398,23 +378,6 @@ static arm_v7s_iopte arm_v7s_pte_to_cont(arm_v7s_iopte pte, int lvl) return pte; } -static arm_v7s_iopte arm_v7s_cont_to_pte(arm_v7s_iopte pte, int lvl) -{ - if (lvl == 1) { - pte &= ~ARM_V7S_CONT_SECTION; - } else if (lvl == 2) { - arm_v7s_iopte xn = pte & BIT(ARM_V7S_CONT_PAGE_XN_SHIFT); - arm_v7s_iopte tex = pte & (ARM_V7S_CONT_PAGE_TEX_MASK << - ARM_V7S_CONT_PAGE_TEX_SHIFT); - - pte ^= xn | tex | ARM_V7S_PTE_TYPE_CONT_PAGE; - pte |= (xn >> ARM_V7S_CONT_PAGE_XN_SHIFT) | - (tex >> ARM_V7S_CONT_PAGE_TEX_SHIFT) | - ARM_V7S_PTE_TYPE_PAGE; - } - return pte; -} - static bool arm_v7s_pte_is_cont(arm_v7s_iopte pte, int lvl) { if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte, lvl)) @@ -591,77 +554,6 @@ static void arm_v7s_free_pgtable(struct io_pgtable *iop) kfree(data); } -static arm_v7s_iopte arm_v7s_split_cont(struct arm_v7s_io_pgtable *data, - unsigned long iova, int idx, int lvl, - arm_v7s_iopte *ptep) -{ - struct io_pgtable *iop = &data->iop; - arm_v7s_iopte pte; - size_t size = ARM_V7S_BLOCK_SIZE(lvl); - int i; - - /* Check that we didn't lose a race to get the lock */ - pte = *ptep; - if (!arm_v7s_pte_is_cont(pte, lvl)) - return pte; - - ptep -= idx & (ARM_V7S_CONT_PAGES - 1); - pte = arm_v7s_cont_to_pte(pte, lvl); - for (i = 0; i < ARM_V7S_CONT_PAGES; i++) - ptep[i] = pte + i * size; - - __arm_v7s_pte_sync(ptep, ARM_V7S_CONT_PAGES, &iop->cfg); - - size *= ARM_V7S_CONT_PAGES; - io_pgtable_tlb_flush_walk(iop, iova, size, size); - return pte; -} - -static size_t arm_v7s_split_blk_unmap(struct arm_v7s_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_v7s_iopte blk_pte, - arm_v7s_iopte *ptep) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_v7s_iopte pte, *tablep; - int i, unmap_idx, num_entries, num_ptes; - - tablep = __arm_v7s_alloc_table(2, GFP_ATOMIC, data); - if (!tablep) - return 0; /* Bytes unmapped */ - - num_ptes = ARM_V7S_PTES_PER_LVL(2, cfg); - num_entries = size >> ARM_V7S_LVL_SHIFT(2); - unmap_idx = ARM_V7S_LVL_IDX(iova, 2, cfg); - - pte = arm_v7s_prot_to_pte(arm_v7s_pte_to_prot(blk_pte, 1), 2, cfg); - if (num_entries > 1) - pte = arm_v7s_pte_to_cont(pte, 2); - - for (i = 0; i < num_ptes; i += num_entries, pte += size) { - /* Unmap! */ - if (i == unmap_idx) - continue; - - __arm_v7s_set_pte(&tablep[i], pte, num_entries, cfg); - } - - pte = arm_v7s_install_table(tablep, ptep, blk_pte, cfg); - if (pte != blk_pte) { - __arm_v7s_free_table(tablep, 2, data); - - if (!ARM_V7S_PTE_IS_TABLE(pte, 1)) - return 0; - - tablep = iopte_deref(pte, 1, data); - return __arm_v7s_unmap(data, gather, iova, size, 2, tablep); - } - - io_pgtable_tlb_add_page(&data->iop, gather, iova, size); - return size; -} - static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, int lvl, @@ -694,11 +586,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, * case in a lock for the sake of correctness and be done with it. */ if (num_entries <= 1 && arm_v7s_pte_is_cont(pte[0], lvl)) { - unsigned long flags; - - spin_lock_irqsave(&data->split_lock, flags); - pte[0] = arm_v7s_split_cont(data, iova, idx, lvl, ptep); - spin_unlock_irqrestore(&data->split_lock, flags); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* If the size matches this level, we're in the right place */ @@ -721,12 +610,8 @@ static size_t __arm_v7s_unmap(struct arm_v7s_io_pgtable *data, } return size; } else if (lvl == 1 && !ARM_V7S_PTE_IS_TABLE(pte[0], lvl)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_v7s_split_blk_unmap(data, gather, iova, size, pte[0], - ptep); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -811,8 +696,6 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg, if (!data) return NULL; - spin_lock_init(&data->split_lock); - /* * ARM_MTK_TTBR_EXT extend the translation table base support larger * memory address. @@ -936,8 +819,8 @@ static int __init arm_v7s_do_selftests(void) .quirks = IO_PGTABLE_QUIRK_ARM_NS, .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M, }; - unsigned int iova, size, iova_start; - unsigned int i, loopnr = 0; + unsigned int iova, size; + unsigned int i; size_t mapped; selftest_running = true; @@ -985,26 +868,6 @@ static int __init arm_v7s_do_selftests(void) return __FAIL(ops); iova += SZ_16M; - loopnr++; - } - - /* Partial unmap */ - i = 1; - size = 1UL << __ffs(cfg.pgsize_bitmap); - while (i < loopnr) { - iova_start = i * SZ_16M; - if (ops->unmap_pages(ops, iova_start + size, size, 1, NULL) != size) - return __FAIL(ops); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, iova_start + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops); - - if (ops->iova_to_phys(ops, iova_start + size + 42) - != (size + 42)) - return __FAIL(ops); - i++; } /* Full unmap */ diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 0e67f1721a3d..6b9bb58a414f 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -106,6 +106,18 @@ #define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6) #define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6) #define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6) +/* + * For !FWB these code to: + * 1111 = Normal outer write back cachable / Inner Write Back Cachable + * Permit S1 to override + * 0101 = Normal Non-cachable / Inner Non-cachable + * 0001 = Device / Device-nGnRE + * For S2FWB these code: + * 0110 Force Normal Write Back + * 0101 Normal* is forced Normal-NC, Device unchanged + * 0001 Force Device-nGnRE + */ +#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2) #define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2) #define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2) #define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2) @@ -199,6 +211,18 @@ static phys_addr_t iopte_to_paddr(arm_lpae_iopte pte, return (paddr | (paddr << (48 - 12))) & (ARM_LPAE_PTE_ADDR_MASK << 4); } +/* + * Convert an index returned by ARM_LPAE_PGD_IDX(), which can point into + * a concatenated PGD, into the maximum number of entries that can be + * mapped in the same table page. + */ +static inline int arm_lpae_max_entries(int i, struct arm_lpae_io_pgtable *data) +{ + int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); + + return ptes_per_table - (i & (ptes_per_table - 1)); +} + static bool selftest_running = false; static dma_addr_t __arm_lpae_dma_addr(void *pages) @@ -390,7 +414,7 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, /* If we can install a leaf entry at this level, then do so */ if (size == block_size) { - max_entries = ARM_LPAE_PTES_PER_TABLE(data) - map_idx_start; + max_entries = arm_lpae_max_entries(map_idx_start, data); num_entries = min_t(int, pgcount, max_entries); ret = arm_lpae_init_pte(data, iova, paddr, prot, lvl, num_entries, ptep); if (!ret) @@ -458,12 +482,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, */ if (data->iop.fmt == ARM_64_LPAE_S2 || data->iop.fmt == ARM_32_LPAE_S2) { - if (prot & IOMMU_MMIO) + if (prot & IOMMU_MMIO) { pte |= ARM_LPAE_PTE_MEMATTR_DEV; - else if (prot & IOMMU_CACHE) - pte |= ARM_LPAE_PTE_MEMATTR_OIWB; - else + } else if (prot & IOMMU_CACHE) { + if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB; + else + pte |= ARM_LPAE_PTE_MEMATTR_OIWB; + } else { pte |= ARM_LPAE_PTE_MEMATTR_NC; + } } else { if (prot & IOMMU_MMIO) pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV @@ -569,66 +597,6 @@ static void arm_lpae_free_pgtable(struct io_pgtable *iop) kfree(data); } -static size_t arm_lpae_split_blk_unmap(struct arm_lpae_io_pgtable *data, - struct iommu_iotlb_gather *gather, - unsigned long iova, size_t size, - arm_lpae_iopte blk_pte, int lvl, - arm_lpae_iopte *ptep, size_t pgcount) -{ - struct io_pgtable_cfg *cfg = &data->iop.cfg; - arm_lpae_iopte pte, *tablep; - phys_addr_t blk_paddr; - size_t tablesz = ARM_LPAE_GRANULE(data); - size_t split_sz = ARM_LPAE_BLOCK_SIZE(lvl, data); - int ptes_per_table = ARM_LPAE_PTES_PER_TABLE(data); - int i, unmap_idx_start = -1, num_entries = 0, max_entries; - - if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) - return 0; - - tablep = __arm_lpae_alloc_pages(tablesz, GFP_ATOMIC, cfg, data->iop.cookie); - if (!tablep) - return 0; /* Bytes unmapped */ - - if (size == split_sz) { - unmap_idx_start = ARM_LPAE_LVL_IDX(iova, lvl, data); - max_entries = ptes_per_table - unmap_idx_start; - num_entries = min_t(int, pgcount, max_entries); - } - - blk_paddr = iopte_to_paddr(blk_pte, data); - pte = iopte_prot(blk_pte); - - for (i = 0; i < ptes_per_table; i++, blk_paddr += split_sz) { - /* Unmap! */ - if (i >= unmap_idx_start && i < (unmap_idx_start + num_entries)) - continue; - - __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 1, &tablep[i]); - } - - pte = arm_lpae_install_table(tablep, ptep, blk_pte, data); - if (pte != blk_pte) { - __arm_lpae_free_pages(tablep, tablesz, cfg, data->iop.cookie); - /* - * We may race against someone unmapping another part of this - * block, but anything else is invalid. We can't misinterpret - * a page entry here since we're never at the last level. - */ - if (iopte_type(pte) != ARM_LPAE_PTE_TYPE_TABLE) - return 0; - - tablep = iopte_deref(pte, data); - } else if (unmap_idx_start >= 0) { - for (i = 0; i < num_entries; i++) - io_pgtable_tlb_add_page(&data->iop, gather, iova + i * size, size); - - return num_entries * size; - } - - return __arm_lpae_unmap(data, gather, iova, size, pgcount, lvl, tablep); -} - static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, struct iommu_iotlb_gather *gather, unsigned long iova, size_t size, size_t pgcount, @@ -650,7 +618,7 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, /* If the size matches this level, we're in the right place */ if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) { - max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; + max_entries = arm_lpae_max_entries(unmap_idx_start, data); num_entries = min_t(int, pgcount, max_entries); /* Find and handle non-leaf entries */ @@ -678,12 +646,8 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { - /* - * Insert a table at the next level to map the old region, - * minus the part we want to unmap - */ - return arm_lpae_split_blk_unmap(data, gather, iova, size, pte, - lvl + 1, ptep, pgcount); + WARN_ONCE(true, "Unmap of a partial large IOPTE is not allowed"); + return 0; } /* Keep on walkin' */ @@ -1035,8 +999,7 @@ arm_64_lpae_alloc_pgtable_s2(struct io_pgtable_cfg *cfg, void *cookie) struct arm_lpae_io_pgtable *data; typeof(&cfg->arm_lpae_s2_cfg.vtcr) vtcr = &cfg->arm_lpae_s2_cfg.vtcr; - /* The NS quirk doesn't apply at stage 2 */ - if (cfg->quirks) + if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_S2FWB)) return NULL; data = arm_lpae_alloc_pgtable(cfg); @@ -1347,19 +1310,6 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) iova += SZ_1G; } - /* Partial unmap */ - size = 1UL << __ffs(cfg->pgsize_bitmap); - if (ops->unmap_pages(ops, SZ_1G + size, size, 1, NULL) != size) - return __FAIL(ops, i); - - /* Remap of partial unmap */ - if (ops->map_pages(ops, SZ_1G + size, size, size, 1, - IOMMU_READ, GFP_KERNEL, &mapped)) - return __FAIL(ops, i); - - if (ops->iova_to_phys(ops, SZ_1G + size + 42) != (size + 42)) - return __FAIL(ops, i); - /* Full unmap */ iova = 0; for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { @@ -1382,6 +1332,23 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) iova += SZ_1G; } + /* + * Map/unmap the last largest supported page of the IAS, this can + * trigger corner cases in the concatednated page tables. + */ + mapped = 0; + size = 1UL << __fls(cfg->pgsize_bitmap); + iova = (1UL << cfg->ias) - size; + if (ops->map_pages(ops, iova, iova, size, 1, + IOMMU_READ | IOMMU_WRITE | + IOMMU_NOEXEC | IOMMU_CACHE, + GFP_KERNEL, &mapped)) + return __FAIL(ops, i); + if (mapped != size) + return __FAIL(ops, i); + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) + return __FAIL(ops, i); + free_io_pgtable_ops(ops); } diff --git a/drivers/iommu/iommu-sysfs.c b/drivers/iommu/iommu-sysfs.c index cbe378c34ba3..170022c09536 100644 --- a/drivers/iommu/iommu-sysfs.c +++ b/drivers/iommu/iommu-sysfs.c @@ -34,7 +34,7 @@ static void release_device(struct device *dev) kfree(dev); } -static struct class iommu_class = { +static const struct class iommu_class = { .name = "iommu", .dev_release = release_device, .dev_groups = dev_groups, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 83c8e617a2c5..7618e9c65d3f 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -32,6 +32,7 @@ #include <trace/events/iommu.h> #include <linux/sched/mm.h> #include <linux/msi.h> +#include <uapi/linux/iommufd.h> #include "dma-iommu.h" #include "iommu-priv.h" @@ -90,15 +91,17 @@ static const char * const iommu_group_resv_type_string[] = { #define IOMMU_CMD_LINE_DMA_API BIT(0) #define IOMMU_CMD_LINE_STRICT BIT(1) +static int bus_iommu_probe(const struct bus_type *bus); static int iommu_bus_notifier(struct notifier_block *nb, unsigned long action, void *data); static void iommu_release_device(struct device *dev); -static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type); static int __iommu_attach_device(struct iommu_domain *domain, struct device *dev); static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group); +static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int type, + unsigned int flags); enum { IOMMU_SET_DOMAIN_MUST_SUCCEED = 1 << 0, @@ -133,6 +136,8 @@ static struct group_device *iommu_group_alloc_device(struct iommu_group *group, struct device *dev); static void __iommu_group_free_device(struct iommu_group *group, struct group_device *grp_dev); +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops); #define IOMMU_GROUP_ATTR(_name, _mode, _show, _store) \ struct iommu_group_attribute iommu_group_attr_##_name = \ @@ -1141,10 +1146,6 @@ map_end: } } - - if (!list_empty(&mappings) && iommu_is_dma_domain(domain)) - iommu_flush_iotlb_all(domain); - out: iommu_put_resv_regions(dev, &mappings); @@ -1586,12 +1587,59 @@ struct iommu_group *fsl_mc_device_group(struct device *dev) } EXPORT_SYMBOL_GPL(fsl_mc_device_group); +static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + if (ops->identity_domain) + return ops->identity_domain; + + /* Older drivers create the identity domain via ops->domain_alloc() */ + if (!ops->domain_alloc) + return ERR_PTR(-EOPNOTSUPP); + + domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); + if (IS_ERR(domain)) + return domain; + if (!domain) + return ERR_PTR(-ENOMEM); + + iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); + return domain; +} + static struct iommu_domain * __iommu_group_alloc_default_domain(struct iommu_group *group, int req_type) { + struct device *dev = iommu_group_first_dev(group); + struct iommu_domain *dom; + if (group->default_domain && group->default_domain->type == req_type) return group->default_domain; - return __iommu_group_domain_alloc(group, req_type); + + /* + * When allocating the DMA API domain assume that the driver is going to + * use PASID and make sure the RID's domain is PASID compatible. + */ + if (req_type & __IOMMU_DOMAIN_PAGING) { + dom = __iommu_paging_domain_alloc_flags(dev, req_type, + dev->iommu->max_pasids ? IOMMU_HWPT_ALLOC_PASID : 0); + + /* + * If driver does not support PASID feature then + * try to allocate non-PASID domain + */ + if (PTR_ERR(dom) == -EOPNOTSUPP) + dom = __iommu_paging_domain_alloc_flags(dev, req_type, 0); + + return dom; + } + + if (req_type == IOMMU_DOMAIN_IDENTITY) + return __iommu_alloc_identity_domain(dev); + + return ERR_PTR(-EINVAL); } /* @@ -1795,7 +1843,7 @@ static void iommu_group_do_probe_finalize(struct device *dev) ops->probe_finalize(dev); } -int bus_iommu_probe(const struct bus_type *bus) +static int bus_iommu_probe(const struct bus_type *bus) { struct iommu_group *group, *next; LIST_HEAD(group_list); @@ -1841,31 +1889,6 @@ int bus_iommu_probe(const struct bus_type *bus) } /** - * iommu_present() - make platform-specific assumptions about an IOMMU - * @bus: bus to check - * - * Do not use this function. You want device_iommu_mapped() instead. - * - * Return: true if some IOMMU is present and aware of devices on the given bus; - * in general it may not be the only IOMMU, and it may not have anything to do - * with whatever device you are ultimately interested in. - */ -bool iommu_present(const struct bus_type *bus) -{ - bool ret = false; - - for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { - if (iommu_buses[i] == bus) { - spin_lock(&iommu_device_lock); - ret = !list_empty(&iommu_device_list); - spin_unlock(&iommu_device_lock); - } - } - return ret; -} -EXPORT_SYMBOL_GPL(iommu_present); - -/** * device_iommu_capable() - check for a general IOMMU capability * @dev: device to which the capability would be relevant, if available * @cap: IOMMU capability @@ -1934,117 +1957,67 @@ void iommu_set_fault_handler(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_set_fault_handler); -static struct iommu_domain *__iommu_domain_alloc(const struct iommu_ops *ops, - struct device *dev, - unsigned int type) +static void iommu_domain_init(struct iommu_domain *domain, unsigned int type, + const struct iommu_ops *ops) { - struct iommu_domain *domain; - unsigned int alloc_type = type & IOMMU_DOMAIN_ALLOC_FLAGS; - - if (alloc_type == IOMMU_DOMAIN_IDENTITY && ops->identity_domain) - return ops->identity_domain; - else if (alloc_type == IOMMU_DOMAIN_BLOCKED && ops->blocked_domain) - return ops->blocked_domain; - else if (type & __IOMMU_DOMAIN_PAGING && ops->domain_alloc_paging) - domain = ops->domain_alloc_paging(dev); - else if (ops->domain_alloc) - domain = ops->domain_alloc(alloc_type); - else - return ERR_PTR(-EOPNOTSUPP); - - /* - * Many domain_alloc ops now return ERR_PTR, make things easier for the - * driver by accepting ERR_PTR from all domain_alloc ops instead of - * having two rules. - */ - if (IS_ERR(domain)) - return domain; - if (!domain) - return ERR_PTR(-ENOMEM); - domain->type = type; domain->owner = ops; + if (!domain->ops) + domain->ops = ops->default_domain_ops; + /* * If not already set, assume all sizes by default; the driver * may override this later */ if (!domain->pgsize_bitmap) domain->pgsize_bitmap = ops->pgsize_bitmap; - - if (!domain->ops) - domain->ops = ops->default_domain_ops; - - if (iommu_is_dma_domain(domain)) { - int rc; - - rc = iommu_get_dma_cookie(domain); - if (rc) { - iommu_domain_free(domain); - return ERR_PTR(rc); - } - } - return domain; } static struct iommu_domain * -__iommu_group_domain_alloc(struct iommu_group *group, unsigned int type) -{ - struct device *dev = iommu_group_first_dev(group); - - return __iommu_domain_alloc(dev_iommu_ops(dev), dev, type); -} - -static int __iommu_domain_alloc_dev(struct device *dev, void *data) +__iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, + unsigned int flags) { - const struct iommu_ops **ops = data; + const struct iommu_ops *ops; + struct iommu_domain *domain; if (!dev_has_iommu(dev)) - return 0; - - if (WARN_ONCE(*ops && *ops != dev_iommu_ops(dev), - "Multiple IOMMU drivers present for bus %s, which the public IOMMU API can't fully support yet. You will still need to disable one or more for this to work, sorry!\n", - dev_bus_name(dev))) - return -EBUSY; - - *ops = dev_iommu_ops(dev); - return 0; -} + return ERR_PTR(-ENODEV); -/* - * The iommu ops in bus has been retired. Do not use this interface in - * new drivers. - */ -struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) -{ - const struct iommu_ops *ops = NULL; - int err = bus_for_each_dev(bus, NULL, &ops, __iommu_domain_alloc_dev); - struct iommu_domain *domain; + ops = dev_iommu_ops(dev); - if (err || !ops) - return NULL; + if (ops->domain_alloc_paging && !flags) + domain = ops->domain_alloc_paging(dev); + else if (ops->domain_alloc_user) + domain = ops->domain_alloc_user(dev, flags, NULL, NULL); + else if (ops->domain_alloc && !flags) + domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); + else + return ERR_PTR(-EOPNOTSUPP); - domain = __iommu_domain_alloc(ops, NULL, IOMMU_DOMAIN_UNMANAGED); if (IS_ERR(domain)) - return NULL; + return domain; + if (!domain) + return ERR_PTR(-ENOMEM); + + iommu_domain_init(domain, type, ops); return domain; } -EXPORT_SYMBOL_GPL(iommu_domain_alloc); /** - * iommu_paging_domain_alloc() - Allocate a paging domain + * iommu_paging_domain_alloc_flags() - Allocate a paging domain * @dev: device for which the domain is allocated + * @flags: Bitmap of iommufd_hwpt_alloc_flags * * Allocate a paging domain which will be managed by a kernel driver. Return - * allocated domain if successful, or a ERR pointer for failure. + * allocated domain if successful, or an ERR pointer for failure. */ -struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +struct iommu_domain *iommu_paging_domain_alloc_flags(struct device *dev, + unsigned int flags) { - if (!dev_has_iommu(dev)) - return ERR_PTR(-ENODEV); - - return __iommu_domain_alloc(dev_iommu_ops(dev), dev, IOMMU_DOMAIN_UNMANAGED); + return __iommu_paging_domain_alloc_flags(dev, + IOMMU_DOMAIN_UNMANAGED, flags); } -EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc); +EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { @@ -2216,8 +2189,8 @@ EXPORT_SYMBOL_GPL(iommu_attach_group); /** * iommu_group_replace_domain - replace the domain that a group is attached to - * @new_domain: new IOMMU domain to replace with * @group: IOMMU group that will be attached to the new domain + * @new_domain: new IOMMU domain to replace with * * This API allows the group to switch domains without being forced to go to * the blocking domain in-between. @@ -2586,6 +2559,20 @@ static size_t __iommu_unmap(struct iommu_domain *domain, return unmapped; } +/** + * iommu_unmap() - Remove mappings from a range of IOVA + * @domain: Domain to manipulate + * @iova: IO virtual address to start + * @size: Length of the range starting from @iova + * + * iommu_unmap() will remove a translation created by iommu_map(). It cannot + * subdivide a mapping created by iommu_map(), so it should be called with IOVA + * ranges that match what was passed to iommu_map(). The range can aggregate + * contiguous iommu_map() calls so long as no individual range is split. + * + * Returns: Number of bytes of IOVA unmapped. iova + res will be the point + * unmapping stopped. + */ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { @@ -2723,16 +2710,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { @@ -2965,6 +2942,14 @@ static int iommu_setup_default_domain(struct iommu_group *group, if (group->default_domain == dom) return 0; + if (iommu_is_dma_domain(dom)) { + ret = iommu_get_dma_cookie(dom); + if (ret) { + iommu_domain_free(dom); + return ret; + } + } + /* * IOMMU_RESV_DIRECT and IOMMU_RESV_DIRECT_RELAXABLE regions must be * mapped before their device is attached, in order to guarantee @@ -3152,22 +3137,25 @@ void iommu_device_unuse_default_domain(struct device *dev) static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) { + struct device *dev = iommu_group_first_dev(group); + const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; if (group->blocking_domain) return 0; - domain = __iommu_group_domain_alloc(group, IOMMU_DOMAIN_BLOCKED); - if (IS_ERR(domain)) { - /* - * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED - * create an empty domain instead. - */ - domain = __iommu_group_domain_alloc(group, - IOMMU_DOMAIN_UNMANAGED); - if (IS_ERR(domain)) - return PTR_ERR(domain); + if (ops->blocked_domain) { + group->blocking_domain = ops->blocked_domain; + return 0; } + + /* + * For drivers that do not yet understand IOMMU_DOMAIN_BLOCKED create an + * empty PAGING domain instead. + */ + domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) + return PTR_ERR(domain); group->blocking_domain = domain; return 0; } @@ -3331,7 +3319,8 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, int ret; for_each_group_device(group, device) { - ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + ret = domain->ops->set_dev_pasid(domain, device->dev, + pasid, NULL); if (ret) goto err_revert; } diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 76656fe0470d..0a07f9449fd9 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,4 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD_DRIVER_CORE + tristate + default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n + config IOMMUFD tristate "IOMMU Userspace API" select INTERVAL_TREE diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cf4605962bea..cb784da6cddc 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -7,9 +7,13 @@ iommufd-y := \ ioas.o \ main.o \ pages.o \ - vfio_compat.o + vfio_compat.o \ + viommu.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o + +iommufd_driver-y := driver.o +obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c new file mode 100644 index 000000000000..7b67fdf44134 --- /dev/null +++ b/drivers/iommu/iommufd/driver.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, IOMMUFD); + +/* Caller should xa_lock(&viommu->vdevs) to protect the return value */ +struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, + unsigned long vdev_id) +{ + struct iommufd_vdevice *vdev; + + lockdep_assert_held(&viommu->vdevs.xa_lock); + + vdev = xa_load(&viommu->vdevs, vdev_id); + return vdev ? vdev->dev : NULL; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, IOMMUFD); + +MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index e590973ce5cf..053b0e30f55a 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -10,6 +10,7 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/pci.h> +#include <linux/pci-ats.h> #include <linux/poll.h> #include <uapi/linux/iommufd.h> @@ -27,8 +28,12 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev) * resource between PF and VFs. There is no coordination for this * shared capability. This waits for a vPRI reset to recover. */ - if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn) - return -EINVAL; + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->is_virtfn && pci_pri_supported(pdev)) + return -EINVAL; + } mutex_lock(&idev->iopf_lock); /* Device iopf has already been on. */ diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index d06bf6e6c19f..9236e8ca9aa8 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -57,7 +57,10 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) container_of(obj, struct iommufd_hwpt_nested, common.obj); __iommufd_hwpt_destroy(&hwpt_nested->common); - refcount_dec(&hwpt_nested->parent->common.obj.users); + if (hwpt_nested->viommu) + refcount_dec(&hwpt_nested->viommu->obj.users); + else + refcount_dec(&hwpt_nested->parent->common.obj.users); } void iommufd_hwpt_nested_abort(struct iommufd_object *obj) @@ -107,7 +110,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_FAULT_ID_VALID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -248,8 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, } hwpt->domain->owner = ops; - if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED || - !hwpt->domain->ops->cache_invalidate_user)) { + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; goto out_abort; } @@ -260,6 +263,58 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_viommu_alloc_hwpt_nested() - Get a hwpt_nested for a vIOMMU + * @viommu: vIOMMU ojbect to associate the hwpt_nested/domain with + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new IOMMU_DOMAIN_NESTED for a vIOMMU and return it as a NESTED + * hw_pagetable. + */ +static struct iommufd_hwpt_nested * +iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (!user_data->len) + return ERR_PTR(-EOPNOTSUPP); + if (!viommu->ops || !viommu->ops->alloc_domain_nested) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_nested = __iommufd_object_alloc( + viommu->ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + hwpt_nested->viommu = viommu; + refcount_inc(&viommu->obj.users); + hwpt_nested->parent = viommu->hwpt; + + hwpt->domain = + viommu->ops->alloc_domain_nested(viommu, flags, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + hwpt->domain->owner = viommu->iommu_dev->ops; + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(viommu->ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; @@ -316,6 +371,22 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_unlock; } hwpt = &hwpt_nested->common; + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_viommu *viommu; + + viommu = container_of(pt_obj, struct iommufd_viommu, obj); + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_unlock; + } + hwpt_nested = iommufd_viommu_alloc_hwpt_nested( + viommu, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; @@ -412,7 +483,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *pt_obj; u32 done_num = 0; int rc; @@ -426,17 +497,40 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) goto out; } - hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = PTR_ERR(pt_obj); goto out; } + if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + if (!hwpt->domain->ops || + !hwpt->domain->ops->cache_invalidate_user) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_viommu *viommu = + container_of(pt_obj, struct iommufd_viommu, obj); + + if (!viommu->ops || !viommu->ops->cache_invalidate) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = viommu->ops->cache_invalidate(viommu, &data_array); + } else { + rc = -EINVAL; + goto out_put_pt; + } - rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, - &data_array); done_num = data_array.entry_num; - iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 4bf7ccd39d46..8a790e597e12 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -107,9 +107,9 @@ static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, * Does not return a 0 IOVA even if it is valid. */ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, - unsigned long uptr, unsigned long length) + unsigned long addr, unsigned long length) { - unsigned long page_offset = uptr % PAGE_SIZE; + unsigned long page_offset = addr % PAGE_SIZE; struct interval_tree_double_span_iter used_span; struct interval_tree_span_iter allowed_span; unsigned long max_alignment = PAGE_SIZE; @@ -122,15 +122,15 @@ static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, return -EOVERFLOW; /* - * Keep alignment present in the uptr when building the IOVA, this + * Keep alignment present in addr when building the IOVA, which * increases the chance we can map a THP. */ - if (!uptr) + if (!addr) iova_alignment = roundup_pow_of_two(length); else iova_alignment = min_t(unsigned long, roundup_pow_of_two(length), - 1UL << __ffs64(uptr)); + 1UL << __ffs64(addr)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE max_alignment = HPAGE_SIZE; @@ -248,6 +248,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int iommu_prot, unsigned int flags) { struct iopt_pages_list *elm; + unsigned long start; unsigned long iova; int rc = 0; @@ -267,9 +268,15 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, /* Use the first entry to guess the ideal IOVA alignment */ elm = list_first_entry(pages_list, struct iopt_pages_list, next); - rc = iopt_alloc_iova( - iopt, dst_iova, - (uintptr_t)elm->pages->uptr + elm->start_byte, length); + switch (elm->pages->type) { + case IOPT_ADDRESS_USER: + start = elm->start_byte + (uintptr_t)elm->pages->uptr; + break; + case IOPT_ADDRESS_FILE: + start = elm->start_byte + elm->pages->start; + break; + } + rc = iopt_alloc_iova(iopt, dst_iova, start, length); if (rc) goto out_unlock; if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && @@ -384,6 +391,34 @@ out_unlock_domains: return rc; } +static int iopt_map_common(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + struct iopt_pages *pages, unsigned long *iova, + unsigned long length, unsigned long start_byte, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = pages; + elm.start_byte = start_byte; + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + /** * iopt_map_user_pages() - Map a user VA to an iova in the io page table * @ictx: iommufd_ctx the iopt is part of @@ -408,29 +443,41 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long length, int iommu_prot, unsigned int flags) { - struct iopt_pages_list elm = {}; - LIST_HEAD(pages_list); - int rc; + struct iopt_pages *pages; - elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); - if (IS_ERR(elm.pages)) - return PTR_ERR(elm.pages); - if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && - elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) - elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; - elm.start_byte = uptr - elm.pages->uptr; - elm.length = length; - list_add(&elm.next, &pages_list); + pages = iopt_alloc_user_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); - rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); - if (rc) { - if (elm.area) - iopt_abort_area(elm.area); - if (elm.pages) - iopt_put_pages(elm.pages); - return rc; - } - return 0; + return iopt_map_common(ictx, iopt, pages, iova, length, + uptr - pages->uptr, iommu_prot, flags); +} + +/** + * iopt_map_file_pages() - Like iopt_map_user_pages, but map a file. + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @file: file to map + * @start: map file starting at this byte offset + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + */ +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages *pages; + + pages = iopt_alloc_file_pages(file, start, length, + iommu_prot & IOMMU_WRITE); + if (IS_ERR(pages)) + return PTR_ERR(pages); + return iopt_map_common(ictx, iopt, pages, iova, length, + start - pages->start, iommu_prot, flags); } struct iova_bitmap_fn_arg { diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index c61d74471684..10c928a9a463 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -173,6 +173,12 @@ enum { IOPT_PAGES_ACCOUNT_NONE = 0, IOPT_PAGES_ACCOUNT_USER = 1, IOPT_PAGES_ACCOUNT_MM = 2, + IOPT_PAGES_ACCOUNT_MODE_NUM = 3, +}; + +enum iopt_address_type { + IOPT_ADDRESS_USER = 0, + IOPT_ADDRESS_FILE = 1, }; /* @@ -195,7 +201,14 @@ struct iopt_pages { struct task_struct *source_task; struct mm_struct *source_mm; struct user_struct *source_user; - void __user *uptr; + enum iopt_address_type type; + union { + void __user *uptr; /* IOPT_ADDRESS_USER */ + struct { /* IOPT_ADDRESS_FILE */ + struct file *file; + unsigned long start; + }; + }; bool writable:1; u8 account_mode; @@ -206,8 +219,10 @@ struct iopt_pages { struct rb_root_cached domains_itree; }; -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable); +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable); +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable); void iopt_release_pages(struct kref *kref); static inline void iopt_put_pages(struct iopt_pages *pages) { @@ -238,4 +253,9 @@ struct iopt_pages_access { unsigned int users; }; +struct pfn_reader_user; + +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user); + #endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 2c4b2bb11e78..1542c5fd10a8 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include <linux/file.h> #include <linux/interval_tree.h> #include <linux/iommu.h> #include <linux/iommufd.h> @@ -51,7 +52,10 @@ int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_table; + + down_read(&ucmd->ictx->ioas_creation_lock); iommufd_object_finalize(ucmd->ictx, &ioas->obj); + up_read(&ucmd->ictx->ioas_creation_lock); return 0; out_table: @@ -197,6 +201,52 @@ static int conv_iommu_prot(u32 map_flags) return iommu_prot; } +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map_file *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + struct file *file; + int rc; + + if (cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) + return -EOPNOTSUPP; + + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + + file = fget(cmd->fd); + if (!file) + return -EBADF; + + rc = iopt_map_file_pages(ucmd->ictx, &ioas->iopt, &iova, file, + cmd->start, cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(ucmd->ictx, &ioas->obj); + fput(file); + return rc; +} + int iommufd_ioas_map(struct iommufd_ucmd *ucmd) { struct iommu_ioas_map *cmd = ucmd->cmd; @@ -327,6 +377,215 @@ out_put: return rc; } +static void iommufd_release_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_ioas *ioas; + unsigned long index; + + xa_for_each(ioas_list, index, ioas) { + up_write(&ioas->iopt.iova_rwsem); + refcount_dec(&ioas->obj.users); + } + up_write(&ictx->ioas_creation_lock); + xa_destroy(ioas_list); +} + +static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx, + struct xarray *ioas_list) +{ + struct iommufd_object *obj; + unsigned long index; + int rc; + + /* + * This is very ugly, it is done instead of adding a lock around + * pages->source_mm, which is a performance path for mdev, we just + * obtain the write side of all the iova_rwsems which also protects the + * pages->source_*. Due to copies we can't know which IOAS could read + * from the pages, so we just lock everything. This is the only place + * locks are nested and they are uniformly taken in ID order. + * + * ioas_creation_lock prevents new IOAS from being installed in the + * xarray while we do this, and also prevents more than one thread from + * holding nested locks. + */ + down_write(&ictx->ioas_creation_lock); + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + struct iommufd_ioas *ioas; + + if (!obj || obj->type != IOMMUFD_OBJ_IOAS) + continue; + + if (!refcount_inc_not_zero(&obj->users)) + continue; + + xa_unlock(&ictx->objects); + + ioas = container_of(obj, struct iommufd_ioas, obj); + down_write_nest_lock(&ioas->iopt.iova_rwsem, + &ictx->ioas_creation_lock); + + rc = xa_err(xa_store(ioas_list, index, ioas, GFP_KERNEL)); + if (rc) { + iommufd_release_all_iova_rwsem(ictx, ioas_list); + return rc; + } + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + return 0; +} + +static bool need_charge_update(struct iopt_pages *pages) +{ + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + return false; + case IOPT_PAGES_ACCOUNT_MM: + return pages->source_mm != current->mm; + case IOPT_PAGES_ACCOUNT_USER: + /* + * Update when mm changes because it also accounts + * in mm->pinned_vm. + */ + return (pages->source_user != current_user()) || + (pages->source_mm != current->mm); + } + return true; +} + +static int charge_current(unsigned long *npinned) +{ + struct iopt_pages tmp = { + .source_mm = current->mm, + .source_task = current->group_leader, + .source_user = current_user(), + }; + unsigned int account_mode; + int rc; + + for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM; + account_mode++) { + if (!npinned[account_mode]) + continue; + + tmp.account_mode = account_mode; + rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true, + NULL); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + while (account_mode != 0) { + account_mode--; + if (!npinned[account_mode]) + continue; + tmp.account_mode = account_mode; + iopt_pages_update_pinned(&tmp, npinned[account_mode], false, + NULL); + } + return rc; +} + +static void change_mm(struct iopt_pages *pages) +{ + struct task_struct *old_task = pages->source_task; + struct user_struct *old_user = pages->source_user; + struct mm_struct *old_mm = pages->source_mm; + + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + mmdrop(old_mm); + + pages->source_task = current->group_leader; + get_task_struct(pages->source_task); + put_task_struct(old_task); + + pages->source_user = get_uid(current_user()); + free_uid(old_user); +} + +#define for_each_ioas_area(_xa, _index, _ioas, _area) \ + xa_for_each((_xa), (_index), (_ioas)) \ + for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \ + _area; \ + _area = iopt_area_iter_next(_area, 0, ULONG_MAX)) + +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_change_process *cmd = ucmd->cmd; + struct iommufd_ctx *ictx = ucmd->ictx; + unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {}; + struct iommufd_ioas *ioas; + struct iopt_area *area; + struct iopt_pages *pages; + struct xarray ioas_list; + unsigned long index; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + xa_init(&ioas_list); + rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list); + if (rc) + return rc; + + for_each_ioas_area(&ioas_list, index, ioas, area) { + if (area->pages->type != IOPT_ADDRESS_FILE) { + rc = -EINVAL; + goto out; + } + } + + /* + * Count last_pinned pages, then clear it to avoid double counting + * if the same iopt_pages is visited multiple times in this loop. + * Since we are under all the locks, npinned == last_npinned, so we + * can easily restore last_npinned before we return. + */ + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + if (need_charge_update(pages)) { + all_npinned[pages->account_mode] += pages->last_npinned; + pages->last_npinned = 0; + } + } + + rc = charge_current(all_npinned); + + if (rc) { + /* Charge failed. Fix last_npinned and bail. */ + for_each_ioas_area(&ioas_list, index, ioas, area) + area->pages->last_npinned = area->pages->npinned; + goto out; + } + + for_each_ioas_area(&ioas_list, index, ioas, area) { + pages = area->pages; + + /* Uncharge the old one (which also restores last_npinned) */ + if (need_charge_update(pages)) { + int r = iopt_pages_update_pinned(pages, pages->npinned, + false, NULL); + + if (WARN_ON(r)) + rc = r; + } + change_mm(pages); + } + +out: + iommufd_release_all_iova_rwsem(ictx, &ioas_list); + return rc; +} + int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f1d865e6fab6..b6d706cf2c66 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -5,8 +5,8 @@ #define __IOMMUFD_PRIVATE_H #include <linux/iommu.h> +#include <linux/iommufd.h> #include <linux/iova_bitmap.h> -#include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/uaccess.h> #include <linux/xarray.h> @@ -24,6 +24,7 @@ struct iommufd_ctx { struct xarray objects; struct xarray groups; wait_queue_head_t destroy_wait; + struct rw_semaphore ioas_creation_lock; u8 account_mode; /* Compatibility with VFIO no iommu */ @@ -69,6 +70,10 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); +int iopt_map_file_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, struct file *file, + unsigned long start, unsigned long length, + int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); @@ -122,29 +127,6 @@ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, return 0; } -enum iommufd_object_type { - IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, - IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HWPT_PAGING, - IOMMUFD_OBJ_HWPT_NESTED, - IOMMUFD_OBJ_IOAS, - IOMMUFD_OBJ_ACCESS, - IOMMUFD_OBJ_FAULT, -#ifdef CONFIG_IOMMUFD_TEST - IOMMUFD_OBJ_SELFTEST, -#endif - IOMMUFD_OBJ_MAX, -}; - -/* Base struct for all objects with a userspace ID handle. */ -struct iommufd_object { - refcount_t shortterm_users; - refcount_t users; - enum iommufd_object_type type; - unsigned int id; -}; - static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) @@ -225,10 +207,6 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type); - #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ @@ -276,6 +254,8 @@ void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd); +int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); @@ -312,6 +292,7 @@ struct iommufd_hwpt_paging { struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; + struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) @@ -528,6 +509,27 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } +static inline struct iommufd_viommu * +iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VIOMMU), + struct iommufd_viommu, obj); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_viommu_destroy(struct iommufd_object *obj); +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_vdevice_destroy(struct iommufd_object *obj); + +struct iommufd_vdevice { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_viommu *viommu; + struct device *dev; + u64 id; /* per-vIOMMU virtual ID */ +}; + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index f4bc23a92f9a..a6b7a163f636 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -23,6 +23,7 @@ enum { IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, IOMMU_TEST_OP_TRIGGER_IOPF, + IOMMU_TEST_OP_DEV_CHECK_CACHE, }; enum { @@ -54,6 +55,11 @@ enum { MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, }; +enum { + MOCK_DEV_CACHE_ID_MAX = 3, + MOCK_DEV_CACHE_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -135,6 +141,10 @@ struct iommu_test_cmd { __u32 perm; __u64 addr; } trigger_iopf; + struct { + __u32 id; + __u32 cache; + } check_dev_cache; }; __u32 last; }; @@ -152,6 +162,7 @@ struct iommu_test_hw_info { /* Should not be equal to any defined value in enum iommu_hwpt_data_type */ #define IOMMU_HWPT_DATA_SELFTEST 0xdead #define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef +#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad /** * struct iommu_hwpt_selftest @@ -180,4 +191,25 @@ struct iommu_hwpt_invalidate_selftest { __u32 iotlb_id; }; +#define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef + +/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU + * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @cache_id: Invalidate cache entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored + */ +struct iommu_viommu_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 vdev_id; + __u32 cache_id; +}; + #endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b5f5d27ee963..0a96cc8f27da 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -29,38 +29,6 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) -{ - struct iommufd_object *obj; - int rc; - - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); - - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize(). - */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_31b, GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); -} - /* * Allow concurrent access to the object. * @@ -73,20 +41,26 @@ out_free: void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); - /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, obj); + xa_unlock(&ictx->objects); + /* obj->id was returned from xa_alloc() so the xas_store() cannot fail */ + WARN_ON(old != XA_ZERO_ENTRY); } /* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) { + XA_STATE(xas, &ictx->objects, obj->id); void *old; - old = xa_erase(&ictx->objects, obj->id); - WARN_ON(old); + xa_lock(&ictx->objects); + old = xas_store(&xas, NULL); + xa_unlock(&ictx->objects); + WARN_ON(old != XA_ZERO_ENTRY); kfree(obj); } @@ -248,6 +222,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); } + init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; @@ -333,6 +308,8 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vfio_ioas vfio_ioas; + struct iommu_viommu_alloc viommu; + struct iommu_vdevice_alloc vdev; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -372,18 +349,26 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process, + struct iommu_ioas_change_process, __reserved), IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, src_iova), IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, struct iommu_ioas_iova_ranges, out_iova_alignment), IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, iova), + IOCTL_OP(IOMMU_IOAS_MAP_FILE, iommufd_ioas_map_file, + struct iommu_ioas_map_file, iova), IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, length), IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, + struct iommu_viommu_alloc, out_viommu_id), + IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, + struct iommu_vdevice_alloc, virt_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -519,6 +504,12 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, + [IOMMUFD_OBJ_VDEVICE] = { + .destroy = iommufd_vdevice_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 93d806c9c073..3427749bc5ce 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,6 +45,7 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. */ +#include <linux/file.h> #include <linux/highmem.h> #include <linux/iommu.h> #include <linux/iommufd.h> @@ -346,27 +347,41 @@ static void batch_destroy(struct pfn_batch *batch, void *backup) kfree(batch->pfns); } -/* true if the pfn was added, false otherwise */ -static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +static bool batch_add_pfn_num(struct pfn_batch *batch, unsigned long pfn, + u32 nr) { const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); - - if (batch->end && - pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && - batch->npfns[batch->end - 1] != MAX_NPFNS) { - batch->npfns[batch->end - 1]++; - batch->total_pfns++; - return true; - } - if (batch->end == batch->array_size) + unsigned int end = batch->end; + + if (end && pfn == batch->pfns[end - 1] + batch->npfns[end - 1] && + nr <= MAX_NPFNS - batch->npfns[end - 1]) { + batch->npfns[end - 1] += nr; + } else if (end < batch->array_size) { + batch->pfns[end] = pfn; + batch->npfns[end] = nr; + batch->end++; + } else { return false; - batch->total_pfns++; - batch->pfns[batch->end] = pfn; - batch->npfns[batch->end] = 1; - batch->end++; + } + + batch->total_pfns += nr; return true; } +static void batch_remove_pfn_num(struct pfn_batch *batch, unsigned long nr) +{ + batch->npfns[batch->end - 1] -= nr; + if (batch->npfns[batch->end - 1] == 0) + batch->end--; + batch->total_pfns -= nr; +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + return batch_add_pfn_num(batch, pfn, 1); +} + /* * Fill the batch with pfns from the domain. When the batch is full, or it * reaches last_index, the function will return. The caller should use @@ -622,6 +637,41 @@ static void batch_from_pages(struct pfn_batch *batch, struct page **pages, break; } +static int batch_from_folios(struct pfn_batch *batch, struct folio ***folios_p, + unsigned long *offset_p, unsigned long npages) +{ + int rc = 0; + struct folio **folios = *folios_p; + unsigned long offset = *offset_p; + + while (npages) { + struct folio *folio = *folios; + unsigned long nr = folio_nr_pages(folio) - offset; + unsigned long pfn = page_to_pfn(folio_page(folio, offset)); + + nr = min(nr, npages); + npages -= nr; + + if (!batch_add_pfn_num(batch, pfn, nr)) + break; + if (nr > 1) { + rc = folio_add_pins(folio, nr - 1); + if (rc) { + batch_remove_pfn_num(batch, nr); + goto out; + } + } + + folios++; + offset = 0; + } + +out: + *folios_p = folios; + *offset_p = offset; + return rc; +} + static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, unsigned int first_page_off, size_t npages) { @@ -703,19 +753,32 @@ struct pfn_reader_user { * neither */ int locked; + + /* The following are only valid if file != NULL. */ + struct file *file; + struct folio **ufolios; + size_t ufolios_len; + unsigned long ufolios_offset; + struct folio **ufolios_next; }; static void pfn_reader_user_init(struct pfn_reader_user *user, struct iopt_pages *pages) { user->upages = NULL; + user->upages_len = 0; user->upages_start = 0; user->upages_end = 0; user->locked = -1; - user->gup_flags = FOLL_LONGTERM; if (pages->writable) user->gup_flags |= FOLL_WRITE; + + user->file = (pages->type == IOPT_ADDRESS_FILE) ? pages->file : NULL; + user->ufolios = NULL; + user->ufolios_len = 0; + user->ufolios_next = NULL; + user->ufolios_offset = 0; } static void pfn_reader_user_destroy(struct pfn_reader_user *user, @@ -724,13 +787,67 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, if (user->locked != -1) { if (user->locked) mmap_read_unlock(pages->source_mm); - if (pages->source_mm != current->mm) + if (!user->file && pages->source_mm != current->mm) mmput(pages->source_mm); user->locked = -1; } kfree(user->upages); user->upages = NULL; + kfree(user->ufolios); + user->ufolios = NULL; +} + +static long pin_memfd_pages(struct pfn_reader_user *user, unsigned long start, + unsigned long npages) +{ + unsigned long i; + unsigned long offset; + unsigned long npages_out = 0; + struct page **upages = user->upages; + unsigned long end = start + (npages << PAGE_SHIFT) - 1; + long nfolios = user->ufolios_len / sizeof(*user->ufolios); + + /* + * todo: memfd_pin_folios should return the last pinned offset so + * we can compute npages pinned, and avoid looping over folios here + * if upages == NULL. + */ + nfolios = memfd_pin_folios(user->file, start, end, user->ufolios, + nfolios, &offset); + if (nfolios <= 0) + return nfolios; + + offset >>= PAGE_SHIFT; + user->ufolios_next = user->ufolios; + user->ufolios_offset = offset; + + for (i = 0; i < nfolios; i++) { + struct folio *folio = user->ufolios[i]; + unsigned long nr = folio_nr_pages(folio); + unsigned long npin = min(nr - offset, npages); + + npages -= npin; + npages_out += npin; + + if (upages) { + if (npin == 1) { + *upages++ = folio_page(folio, offset); + } else { + int rc = folio_add_pins(folio, npin - 1); + + if (rc) + return rc; + + while (npin--) + *upages++ = folio_page(folio, offset++); + } + } + + offset = 0; + } + + return npages_out; } static int pfn_reader_user_pin(struct pfn_reader_user *user, @@ -739,7 +856,9 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, unsigned long last_index) { bool remote_mm = pages->source_mm != current->mm; - unsigned long npages; + unsigned long npages = last_index - start_index + 1; + unsigned long start; + unsigned long unum; uintptr_t uptr; long rc; @@ -747,40 +866,50 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, WARN_ON(last_index < start_index)) return -EINVAL; - if (!user->upages) { + if (!user->file && !user->upages) { /* All undone in pfn_reader_destroy() */ - user->upages_len = - (last_index - start_index + 1) * sizeof(*user->upages); + user->upages_len = npages * sizeof(*user->upages); user->upages = temp_kmalloc(&user->upages_len, NULL, 0); if (!user->upages) return -ENOMEM; } + if (user->file && !user->ufolios) { + user->ufolios_len = npages * sizeof(*user->ufolios); + user->ufolios = temp_kmalloc(&user->ufolios_len, NULL, 0); + if (!user->ufolios) + return -ENOMEM; + } + if (user->locked == -1) { /* * The majority of usages will run the map task within the mm * providing the pages, so we can optimize into * get_user_pages_fast() */ - if (remote_mm) { + if (!user->file && remote_mm) { if (!mmget_not_zero(pages->source_mm)) return -EFAULT; } user->locked = 0; } - npages = min_t(unsigned long, last_index - start_index + 1, - user->upages_len / sizeof(*user->upages)); - + unum = user->file ? user->ufolios_len / sizeof(*user->ufolios) : + user->upages_len / sizeof(*user->upages); + npages = min_t(unsigned long, npages, unum); if (iommufd_should_fail()) return -EFAULT; - uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); - if (!remote_mm) + if (user->file) { + start = pages->start + (start_index * PAGE_SIZE); + rc = pin_memfd_pages(user, start, npages); + } else if (!remote_mm) { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); rc = pin_user_pages_fast(uptr, npages, user->gup_flags, user->upages); - else { + } else { + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); if (!user->locked) { mmap_read_lock(pages->source_mm); user->locked = 1; @@ -838,7 +967,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, mmap_read_unlock(pages->source_mm); user->locked = 0; /* If we had the lock then we also have a get */ - } else if ((!user || !user->upages) && + + } else if ((!user || (!user->upages && !user->ufolios)) && pages->source_mm != current->mm) { if (!mmget_not_zero(pages->source_mm)) return -EINVAL; @@ -855,8 +985,8 @@ static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, return rc; } -static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, - bool inc, struct pfn_reader_user *user) +int iopt_pages_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) { int rc = 0; @@ -890,8 +1020,8 @@ static void update_unpinned(struct iopt_pages *pages) return; if (pages->npinned == pages->last_npinned) return; - do_update_pinned(pages, pages->last_npinned - pages->npinned, false, - NULL); + iopt_pages_update_pinned(pages, pages->last_npinned - pages->npinned, + false, NULL); } /* @@ -921,7 +1051,7 @@ static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, npages = pages->npinned - pages->last_npinned; inc = true; } - return do_update_pinned(pages, npages, inc, user); + return iopt_pages_update_pinned(pages, npages, inc, user); } /* @@ -978,6 +1108,8 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) { struct interval_tree_double_span_iter *span = &pfns->span; unsigned long start_index = pfns->batch_end_index; + struct pfn_reader_user *user = &pfns->user; + unsigned long npages; struct iopt_area *area; int rc; @@ -1015,11 +1147,17 @@ static int pfn_reader_fill_span(struct pfn_reader *pfns) return rc; } - batch_from_pages(&pfns->batch, - pfns->user.upages + - (start_index - pfns->user.upages_start), - pfns->user.upages_end - start_index); - return 0; + npages = user->upages_end - start_index; + start_index -= user->upages_start; + rc = 0; + + if (!user->file) + batch_from_pages(&pfns->batch, user->upages + start_index, + npages); + else + rc = batch_from_folios(&pfns->batch, &user->ufolios_next, + &user->ufolios_offset, npages); + return rc; } static bool pfn_reader_done(struct pfn_reader *pfns) @@ -1092,16 +1230,25 @@ static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, static void pfn_reader_release_pins(struct pfn_reader *pfns) { struct iopt_pages *pages = pfns->pages; + struct pfn_reader_user *user = &pfns->user; - if (pfns->user.upages_end > pfns->batch_end_index) { - size_t npages = pfns->user.upages_end - pfns->batch_end_index; - + if (user->upages_end > pfns->batch_end_index) { /* Any pages not transferred to the batch are just unpinned */ - unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - - pfns->user.upages_start), - npages); + + unsigned long npages = user->upages_end - pfns->batch_end_index; + unsigned long start_index = pfns->batch_end_index - + user->upages_start; + + if (!user->file) { + unpin_user_pages(user->upages + start_index, npages); + } else { + long n = user->ufolios_len / sizeof(*user->ufolios); + + unpin_folios(user->ufolios_next, + user->ufolios + n - user->ufolios_next); + } iopt_pages_sub_npinned(pages, npages); - pfns->user.upages_end = pfns->batch_end_index; + user->upages_end = pfns->batch_end_index; } if (pfns->batch_start_index != pfns->batch_end_index) { pfn_reader_unpin(pfns); @@ -1139,11 +1286,11 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, return 0; } -struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, - bool writable) +static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, + unsigned long length, + bool writable) { struct iopt_pages *pages; - unsigned long end; /* * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP @@ -1152,9 +1299,6 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, if (length > SIZE_MAX - PAGE_SIZE || length == 0) return ERR_PTR(-EINVAL); - if (check_add_overflow((unsigned long)uptr, length, &end)) - return ERR_PTR(-EOVERFLOW); - pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); if (!pages) return ERR_PTR(-ENOMEM); @@ -1164,8 +1308,7 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, mutex_init(&pages->mutex); pages->source_mm = current->mm; mmgrab(pages->source_mm); - pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); - pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + start_byte, PAGE_SIZE); pages->access_itree = RB_ROOT_CACHED; pages->domains_itree = RB_ROOT_CACHED; pages->writable = writable; @@ -1179,6 +1322,45 @@ struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, return pages; } +struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, + unsigned long length, bool writable) +{ + struct iopt_pages *pages; + unsigned long end; + void __user *uptr_down = + (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + + if (check_add_overflow((unsigned long)uptr, length, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(uptr - uptr_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->uptr = uptr_down; + pages->type = IOPT_ADDRESS_USER; + return pages; +} + +struct iopt_pages *iopt_alloc_file_pages(struct file *file, unsigned long start, + unsigned long length, bool writable) + +{ + struct iopt_pages *pages; + unsigned long start_down = ALIGN_DOWN(start, PAGE_SIZE); + unsigned long end; + + if (length && check_add_overflow(start, length - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + pages = iopt_alloc_pages(start - start_down, length, writable); + if (IS_ERR(pages)) + return pages; + pages->file = get_file(file); + pages->start = start_down; + pages->type = IOPT_ADDRESS_FILE; + return pages; +} + void iopt_release_pages(struct kref *kref) { struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); @@ -1191,6 +1373,8 @@ void iopt_release_pages(struct kref *kref) mutex_destroy(&pages->mutex); put_task_struct(pages->source_task); free_uid(pages->source_user); + if (pages->type == IOPT_ADDRESS_FILE) + fput(pages->file); kfree(pages); } @@ -1630,11 +1814,11 @@ static int iopt_pages_fill_from_domain(struct iopt_pages *pages, return 0; } -static int iopt_pages_fill_from_mm(struct iopt_pages *pages, - struct pfn_reader_user *user, - unsigned long start_index, - unsigned long last_index, - struct page **out_pages) +static int iopt_pages_fill(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) { unsigned long cur_index = start_index; int rc; @@ -1708,8 +1892,8 @@ int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, /* hole */ cur_pages = out_pages + (span.start_hole - start_index); - rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, - span.last_hole, cur_pages); + rc = iopt_pages_fill(pages, &user, span.start_hole, + span.last_hole, cur_pages); if (rc) goto out_clean_xa; rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, @@ -1789,6 +1973,10 @@ static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, struct page *page = NULL; int rc; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!mmget_not_zero(pages->source_mm)) return iopt_pages_rw_slow(pages, index, index, offset, data, length, flags); @@ -1844,6 +2032,15 @@ int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) return -EPERM; + if (pages->type == IOPT_ADDRESS_FILE) + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(pages->type != IOPT_ADDRESS_USER)) + return -EINVAL; + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { if (start_index == last_index) return iopt_pages_rw_page(pages, start_index, diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 540437be168a..2f9de177dffc 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -126,12 +126,35 @@ struct mock_iommu_domain { struct xarray pfns; }; +static inline struct mock_iommu_domain * +to_mock_domain(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain, domain); +} + struct mock_iommu_domain_nested { struct iommu_domain domain; + struct mock_viommu *mock_viommu; struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; +static inline struct mock_iommu_domain_nested * +to_mock_nested(struct iommu_domain *domain) +{ + return container_of(domain, struct mock_iommu_domain_nested, domain); +} + +struct mock_viommu { + struct iommufd_viommu core; + struct mock_iommu_domain *s2_parent; +}; + +static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) +{ + return container_of(viommu, struct mock_viommu, core); +} + enum selftest_obj_type { TYPE_IDEV, }; @@ -140,8 +163,14 @@ struct mock_dev { struct device dev; unsigned long flags; int id; + u32 cache[MOCK_DEV_CACHE_NUM]; }; +static inline struct mock_dev *to_mock_dev(struct device *dev) +{ + return container_of(dev, struct mock_dev, dev); +} + struct selftest_obj { struct iommufd_object obj; enum selftest_obj_type type; @@ -155,10 +184,15 @@ struct selftest_obj { }; }; +static inline struct selftest_obj *to_selftest_obj(struct iommufd_object *obj) +{ + return container_of(obj, struct selftest_obj, obj); +} + static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) return -EINVAL; @@ -193,8 +227,7 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, bool enable) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = mock->flags; if (enable && !domain->dirty_ops) @@ -243,8 +276,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, unsigned long flags, struct iommu_dirty_bitmap *dirty) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long end = iova + size; void *ent; @@ -281,7 +313,7 @@ static const struct iommu_dirty_ops dirty_ops = { static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); @@ -298,76 +330,85 @@ static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) return &mock->domain; } -static struct iommu_domain * -__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, - const struct iommu_hwpt_selftest *user_cfg) +static struct mock_iommu_domain_nested * +__mock_domain_alloc_nested(const struct iommu_user_data *user_data) { struct mock_iommu_domain_nested *mock_nested; - int i; + struct iommu_hwpt_selftest user_cfg; + int rc, i; + + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); if (!mock_nested) return ERR_PTR(-ENOMEM); - mock_nested->parent = mock_parent; mock_nested->domain.ops = &domain_nested_ops; mock_nested->domain.type = IOMMU_DOMAIN_NESTED; for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) - mock_nested->iotlb[i] = user_cfg->iotlb; - return &mock_nested->domain; + mock_nested->iotlb[i] = user_cfg.iotlb; + return mock_nested; } static struct iommu_domain * -mock_domain_alloc_user(struct device *dev, u32 flags, - struct iommu_domain *parent, - const struct iommu_user_data *user_data) +mock_domain_alloc_nested(struct iommu_domain *parent, u32 flags, + const struct iommu_user_data *user_data) { + struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; - struct iommu_hwpt_selftest user_cfg; - int rc; - - /* must be mock_domain */ - if (!parent) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); - bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; - bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; - struct iommu_domain *domain; - - if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | - IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) - return ERR_PTR(-EOPNOTSUPP); - if (user_data || (has_dirty_flag && no_dirty_ops)) - return ERR_PTR(-EOPNOTSUPP); - domain = mock_domain_alloc_paging(dev); - if (!domain) - return ERR_PTR(-ENOMEM); - if (has_dirty_flag) - container_of(domain, struct mock_iommu_domain, domain) - ->domain.dirty_ops = &dirty_ops; - return domain; - } - /* must be mock_domain_nested */ - if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + if (flags) return ERR_PTR(-EOPNOTSUPP); if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); - mock_parent = container_of(parent, struct mock_iommu_domain, domain); + mock_parent = to_mock_domain(parent); if (!mock_parent) return ERR_PTR(-EINVAL); - rc = iommu_copy_struct_from_user(&user_cfg, user_data, - IOMMU_HWPT_DATA_SELFTEST, iotlb); - if (rc) - return ERR_PTR(rc); + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->parent = mock_parent; + return &mock_nested->domain; +} + +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT; + bool no_dirty_ops = to_mock_dev(dev)->flags & + MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; - return __mock_domain_alloc_nested(mock_parent, &user_cfg); + if (parent) + return mock_domain_alloc_nested(parent, flags, user_data); + + if (user_data) + return ERR_PTR(-EOPNOTSUPP); + if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + + domain = mock_domain_alloc_paging(dev); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + domain->dirty_ops = &dirty_ops; + return domain; } static void mock_domain_free(struct iommu_domain *domain) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); WARN_ON(!xa_empty(&mock->pfns)); kfree(mock); @@ -378,8 +419,7 @@ static int mock_domain_map_pages(struct iommu_domain *domain, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); unsigned long flags = MOCK_PFN_START_IOVA; unsigned long start_iova = iova; @@ -430,8 +470,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, size_t pgcount, struct iommu_iotlb_gather *iotlb_gather) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); bool first = true; size_t ret = 0; void *ent; @@ -479,8 +518,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { - struct mock_iommu_domain *mock = - container_of(domain, struct mock_iommu_domain, domain); + struct mock_iommu_domain *mock = to_mock_domain(domain); void *ent; WARN_ON(iova % MOCK_IO_PAGE_SIZE); @@ -491,7 +529,7 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); switch (cap) { case IOMMU_CAP_CACHE_COHERENCY: @@ -507,14 +545,17 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) static struct iopf_queue *mock_iommu_iopf_queue; -static struct iommu_device mock_iommu_device = { -}; +static struct mock_iommu_device { + struct iommu_device iommu_dev; + struct completion complete; + refcount_t users; +} mock_iommu; static struct iommu_device *mock_probe_device(struct device *dev) { if (dev->bus != &iommufd_mock_bus_type.bus) return ERR_PTR(-ENODEV); - return &mock_iommu_device; + return &mock_iommu.iommu_dev; } static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, @@ -540,6 +581,132 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea return 0; } +static void mock_viommu_destroy(struct iommufd_viommu *viommu) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + + if (refcount_dec_and_test(&mock_iommu->users)) + complete(&mock_iommu->complete); + + /* iommufd core frees mock_viommu and viommu */ +} + +static struct iommu_domain * +mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); + struct mock_iommu_domain_nested *mock_nested; + + if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + return ERR_PTR(-EOPNOTSUPP); + + mock_nested = __mock_domain_alloc_nested(user_data); + if (IS_ERR(mock_nested)) + return ERR_CAST(mock_nested); + mock_nested->mock_viommu = mock_viommu; + mock_nested->parent = mock_viommu->s2_parent; + return &mock_nested->domain; +} + +static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_viommu_invalidate_selftest *cmds; + struct iommu_viommu_invalidate_selftest *cur; + struct iommu_viommu_invalidate_selftest *end; + int rc; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 3 * sizeof(u32)); + rc = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST); + if (rc) + goto out; + + while (cur != end) { + struct mock_dev *mdev; + struct device *dev; + int i; + + if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) { + rc = -EINVAL; + goto out; + } + + xa_lock(&viommu->vdevs); + dev = iommufd_viommu_find_dev(viommu, + (unsigned long)cur->vdev_id); + if (!dev) { + xa_unlock(&viommu->vdevs); + rc = -EINVAL; + goto out; + } + mdev = container_of(dev, struct mock_dev, dev); + + if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all cache entries and ignore cache_id */ + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = 0; + } else { + mdev->cache[cur->cache_id] = 0; + } + xa_unlock(&viommu->vdevs); + + cur++; + } +out: + array->entry_num = cur - cmds; + kfree(cmds); + return rc; +} + +static struct iommufd_viommu_ops mock_viommu_ops = { + .destroy = mock_viommu_destroy, + .alloc_domain_nested = mock_viommu_alloc_domain_nested, + .cache_invalidate = mock_viommu_cache_invalidate, +}; + +static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, + struct iommu_domain *domain, + struct iommufd_ctx *ictx, + unsigned int viommu_type) +{ + struct mock_iommu_device *mock_iommu = + iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu; + + if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + + mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core, + &mock_viommu_ops); + if (IS_ERR(mock_viommu)) + return ERR_CAST(mock_viommu); + + refcount_inc(&mock_iommu->users); + return &mock_viommu->core; +} + static const struct iommu_ops mock_ops = { /* * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() @@ -559,6 +726,7 @@ static const struct iommu_ops mock_ops = { .dev_enable_feat = mock_dev_enable_feat, .dev_disable_feat = mock_dev_disable_feat, .user_pasid_table = true, + .viommu_alloc = mock_viommu_alloc, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, @@ -571,18 +739,14 @@ static const struct iommu_ops mock_ops = { static void mock_domain_free_nested(struct iommu_domain *domain) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); - - kfree(mock_nested); + kfree(to_mock_nested(domain)); } static int mock_domain_cache_invalidate_user(struct iommu_domain *domain, struct iommu_user_data_array *array) { - struct mock_iommu_domain_nested *mock_nested = - container_of(domain, struct mock_iommu_domain_nested, domain); + struct mock_iommu_domain_nested *mock_nested = to_mock_nested(domain); struct iommu_hwpt_invalidate_selftest inv; u32 processed = 0; int i = 0, j; @@ -657,7 +821,7 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + *mock = to_mock_domain(hwpt->domain); return hwpt; } @@ -675,14 +839,13 @@ get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, iommufd_put_object(ucmd->ictx, &hwpt->obj); return ERR_PTR(-EINVAL); } - *mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + *mock_nested = to_mock_nested(hwpt->domain); return hwpt; } static void mock_dev_release(struct device *dev) { - struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct mock_dev *mdev = to_mock_dev(dev); ida_free(&mock_dev_ida, mdev->id); kfree(mdev); @@ -691,7 +854,7 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; - int rc; + int rc, i; if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) @@ -705,6 +868,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT; rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); if (rc < 0) @@ -813,7 +978,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, if (IS_ERR(dev_obj)) return PTR_ERR(dev_obj); - sobj = container_of(dev_obj, struct selftest_obj, obj); + sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { rc = -EINVAL; goto out_dev_obj; @@ -951,8 +1116,7 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, if (IS_ERR(hwpt)) return PTR_ERR(hwpt); - mock_nested = container_of(hwpt->domain, - struct mock_iommu_domain_nested, domain); + mock_nested = to_mock_nested(hwpt->domain); if (iotlb_id > MOCK_NESTED_DOMAIN_IOTLB_ID_MAX || mock_nested->iotlb[iotlb_id] != iotlb) @@ -961,6 +1125,24 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, u32 idev_id, + unsigned int cache_id, u32 cache) +{ + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = 0; + + idev = iommufd_get_device(ucmd, idev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = container_of(idev->dev, struct mock_dev, dev); + + if (cache_id > MOCK_DEV_CACHE_ID_MAX || mdev->cache[cache_id] != cache) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1431,7 +1613,7 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, void iommufd_selftest_destroy(struct iommufd_object *obj) { - struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + struct selftest_obj *sobj = to_selftest_obj(obj); switch (sobj->type) { case TYPE_IDEV: @@ -1470,6 +1652,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); + case IOMMU_TEST_OP_DEV_CHECK_CACHE: + return iommufd_test_dev_check_cache(ucmd, cmd->id, + cmd->check_dev_cache.id, + cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); @@ -1536,24 +1722,27 @@ int __init iommufd_test_init(void) if (rc) goto err_platform; - rc = iommu_device_sysfs_add(&mock_iommu_device, + rc = iommu_device_sysfs_add(&mock_iommu.iommu_dev, &selftest_iommu_dev->dev, NULL, "%s", dev_name(&selftest_iommu_dev->dev)); if (rc) goto err_bus; - rc = iommu_device_register_bus(&mock_iommu_device, &mock_ops, + rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + refcount_set(&mock_iommu.users, 1); + init_completion(&mock_iommu.complete); + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); return 0; err_sysfs: - iommu_device_sysfs_remove(&mock_iommu_device); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); err_bus: bus_unregister(&iommufd_mock_bus_type.bus); err_platform: @@ -1563,6 +1752,22 @@ err_dbgfs: return rc; } +static void iommufd_test_wait_for_users(void) +{ + if (refcount_dec_and_test(&mock_iommu.users)) + return; + /* + * Time out waiting for iommu device user count to become 0. + * + * Note that this is just making an example here, since the selftest is + * built into the iommufd module, i.e. it only unplugs the iommu device + * when unloading the module. So, it is expected that this WARN_ON will + * not trigger, as long as any iommufd FDs are open. + */ + WARN_ON(!wait_for_completion_timeout(&mock_iommu.complete, + msecs_to_jiffies(10000))); +} + void iommufd_test_exit(void) { if (mock_iommu_iopf_queue) { @@ -1570,8 +1775,9 @@ void iommufd_test_exit(void) mock_iommu_iopf_queue = NULL; } - iommu_device_sysfs_remove(&mock_iommu_device); - iommu_device_unregister_bus(&mock_iommu_device, + iommufd_test_wait_for_users(); + iommu_device_sysfs_remove(&mock_iommu.iommu_dev); + iommu_device_unregister_bus(&mock_iommu.iommu_dev, &iommufd_mock_bus_type.bus, &iommufd_mock_bus_type.nb); bus_unregister(&iommufd_mock_bus_type.bus); diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59..514aacd64009 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. It was an incomplete - * idea that got merged. - * https://lore.kernel.org/kvm/[email protected]/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c new file mode 100644 index 000000000000..69b88e8c7c26 --- /dev/null +++ b/drivers/iommu/iommufd/viommu.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ +#include "iommufd_private.h" + +void iommufd_viommu_destroy(struct iommufd_object *obj) +{ + struct iommufd_viommu *viommu = + container_of(obj, struct iommufd_viommu, obj); + + if (viommu->ops && viommu->ops->destroy) + viommu->ops->destroy(viommu); + refcount_dec(&viommu->hwpt->common.obj.users); + xa_destroy(&viommu->vdevs); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_alloc *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + const struct iommu_ops *ops; + int rc; + + if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + ops = dev_iommu_ops(idev->dev); + if (!ops->viommu_alloc) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_put_idev; + } + + if (!hwpt_paging->nest_parent) { + rc = -EINVAL; + goto out_put_hwpt; + } + + viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, + ucmd->ictx, cmd->type); + if (IS_ERR(viommu)) { + rc = PTR_ERR(viommu); + goto out_put_hwpt; + } + + xa_init(&viommu->vdevs); + viommu->type = cmd->type; + viommu->ictx = ucmd->ictx; + viommu->hwpt = hwpt_paging; + refcount_inc(&viommu->hwpt->common.obj.users); + /* + * It is the most likely case that a physical IOMMU is unpluggable. A + * pluggable IOMMU instance (if exists) is responsible for refcounting + * on its own. + */ + viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + + cmd->out_viommu_id = viommu->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &viommu->obj); + goto out_put_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); +out_put_hwpt: + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + +void iommufd_vdevice_destroy(struct iommufd_object *obj) +{ + struct iommufd_vdevice *vdev = + container_of(obj, struct iommufd_vdevice, obj); + struct iommufd_viommu *viommu = vdev->viommu; + + /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ + xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); + refcount_dec(&viommu->obj.users); + put_device(vdev->dev); +} + +int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_vdevice_alloc *cmd = ucmd->cmd; + struct iommufd_vdevice *vdev, *curr; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 virt_id = cmd->virt_id; + int rc = 0; + + /* virt_id indexes an xarray */ + if (virt_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + if (viommu->iommu_dev != __iommu_get_iommu_dev(idev->dev)) { + rc = -EINVAL; + goto out_put_idev; + } + + vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE); + if (IS_ERR(vdev)) { + rc = PTR_ERR(vdev); + goto out_put_idev; + } + + vdev->id = virt_id; + vdev->dev = idev->dev; + get_device(idev->dev); + vdev->viommu = viommu; + refcount_inc(&viommu->obj.users); + + curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); + if (curr) { + rc = xa_err(curr) ?: -EEXIST; + goto out_abort; + } + + cmd->out_vdevice_id = vdev->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &vdev->obj); + goto out_put_idev; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 16c6adff3eb7..a28197b88c92 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -506,7 +506,7 @@ __adjust_overlap_range(struct iova *iova, * reserve_iova - reserves an iova in the given range * @iovad: - iova domain pointer * @pfn_lo: - lower page frame address - * @pfn_hi:- higher pfn adderss + * @pfn_hi:- higher pfn address * This function allocates reserves the address range from pfn_lo to pfn_hi so * that this address is not dished out as part of alloc_iova. */ diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 6a2707fe7a78..c45313c43b9e 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -1599,7 +1599,7 @@ static const unsigned int mt8186_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK static const struct mtk_iommu_plat_data mt8186_data_mm = { .m4u_plat = M4U_MT8186, .flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN | - WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM, + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | PGTABLE_PA_35_EN, .larbid_remap = {{0}, {1, MTK_INVALID_LARBID, 8}, {4}, {7}, {2}, {9, 11, 19, 20}, {MTK_INVALID_LARBID, 14, 16}, {MTK_INVALID_LARBID, 13, MTK_INVALID_LARBID, 17}}, diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index c9528065a59a..3f72aef8bd5b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1230,25 +1230,24 @@ static int omap_iommu_probe(struct platform_device *pdev) if (err) return err; - err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); - if (err) - goto out_sysfs; obj->has_iommu_driver = true; } + err = iommu_device_register(&obj->iommu, &omap_iommu_ops, &pdev->dev); + if (err) + goto out_sysfs; + pm_runtime_enable(obj->dev); omap_iommu_debugfs_add(obj); dev_info(&pdev->dev, "%s registered\n", obj->name); - /* Re-probe bus to probe device attached to this IOMMU */ - bus_iommu_probe(&platform_bus_type); - return 0; out_sysfs: - iommu_device_sysfs_remove(&obj->iommu); + if (obj->has_iommu_driver) + iommu_device_sysfs_remove(&obj->iommu); return err; } @@ -1256,10 +1255,10 @@ static void omap_iommu_remove(struct platform_device *pdev) { struct omap_iommu *obj = platform_get_drvdata(pdev); - if (obj->has_iommu_driver) { + if (obj->has_iommu_driver) iommu_device_sysfs_remove(&obj->iommu); - iommu_device_unregister(&obj->iommu); - } + + iommu_device_unregister(&obj->iommu); omap_iommu_debugfs_remove(obj); @@ -1723,12 +1722,19 @@ static void omap_iommu_release_device(struct device *dev) } +static int omap_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) +{ + /* TODO: collect args->np to save re-parsing in probe above */ + return 0; +} + static const struct iommu_ops omap_iommu_ops = { .identity_domain = &omap_iommu_identity_domain, .domain_alloc_paging = omap_iommu_domain_alloc_paging, .probe_device = omap_iommu_probe_device, .release_device = omap_iommu_release_device, .device_group = generic_single_device_group, + .of_xlate = omap_iommu_of_xlate, .pgsize_bitmap = OMAP_IOMMU_PGSIZES, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = omap_iommu_attach_dev, diff --git a/drivers/iommu/riscv/Kconfig b/drivers/iommu/riscv/Kconfig new file mode 100644 index 000000000000..c071816f59a6 --- /dev/null +++ b/drivers/iommu/riscv/Kconfig @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-only +# RISC-V IOMMU support + +config RISCV_IOMMU + bool "RISC-V IOMMU Support" + depends on RISCV && 64BIT + default y + select IOMMU_API + help + Support for implementations of the RISC-V IOMMU architecture that + complements the RISC-V MMU capabilities, providing similar address + translation and protection functions for accesses from I/O devices. + + Say Y here if your SoC includes an IOMMU device implementing + the RISC-V IOMMU architecture. + +config RISCV_IOMMU_PCI + def_bool y if RISCV_IOMMU && PCI_MSI + help + Support for the PCIe implementation of RISC-V IOMMU architecture. diff --git a/drivers/iommu/riscv/Makefile b/drivers/iommu/riscv/Makefile new file mode 100644 index 000000000000..f54c9ed17d41 --- /dev/null +++ b/drivers/iommu/riscv/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_RISCV_IOMMU) += iommu.o iommu-platform.o +obj-$(CONFIG_RISCV_IOMMU_PCI) += iommu-pci.o diff --git a/drivers/iommu/riscv/iommu-bits.h b/drivers/iommu/riscv/iommu-bits.h new file mode 100644 index 000000000000..98daf0e1a306 --- /dev/null +++ b/drivers/iommu/riscv/iommu-bits.h @@ -0,0 +1,784 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * Copyright © 2023 RISC-V IOMMU Task Group + * + * RISC-V IOMMU - Register Layout and Data Structures. + * + * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0 + * Published at https://github.com/riscv-non-isa/riscv-iommu + * + */ + +#ifndef _RISCV_IOMMU_BITS_H_ +#define _RISCV_IOMMU_BITS_H_ + +#include <linux/types.h> +#include <linux/bitfield.h> +#include <linux/bits.h> + +/* + * Chapter 5: Memory Mapped register interface + */ + +/* Common field positions */ +#define RISCV_IOMMU_PPN_FIELD GENMASK_ULL(53, 10) +#define RISCV_IOMMU_QUEUE_LOG2SZ_FIELD GENMASK_ULL(4, 0) +#define RISCV_IOMMU_QUEUE_INDEX_FIELD GENMASK_ULL(31, 0) +#define RISCV_IOMMU_QUEUE_ENABLE BIT(0) +#define RISCV_IOMMU_QUEUE_INTR_ENABLE BIT(1) +#define RISCV_IOMMU_QUEUE_MEM_FAULT BIT(8) +#define RISCV_IOMMU_QUEUE_OVERFLOW BIT(9) +#define RISCV_IOMMU_QUEUE_ACTIVE BIT(16) +#define RISCV_IOMMU_QUEUE_BUSY BIT(17) + +#define RISCV_IOMMU_ATP_PPN_FIELD GENMASK_ULL(43, 0) +#define RISCV_IOMMU_ATP_MODE_FIELD GENMASK_ULL(63, 60) + +/* 5.3 IOMMU Capabilities (64bits) */ +#define RISCV_IOMMU_REG_CAPABILITIES 0x0000 +#define RISCV_IOMMU_CAPABILITIES_VERSION GENMASK_ULL(7, 0) +#define RISCV_IOMMU_CAPABILITIES_SV32 BIT_ULL(8) +#define RISCV_IOMMU_CAPABILITIES_SV39 BIT_ULL(9) +#define RISCV_IOMMU_CAPABILITIES_SV48 BIT_ULL(10) +#define RISCV_IOMMU_CAPABILITIES_SV57 BIT_ULL(11) +#define RISCV_IOMMU_CAPABILITIES_SVPBMT BIT_ULL(15) +#define RISCV_IOMMU_CAPABILITIES_SV32X4 BIT_ULL(16) +#define RISCV_IOMMU_CAPABILITIES_SV39X4 BIT_ULL(17) +#define RISCV_IOMMU_CAPABILITIES_SV48X4 BIT_ULL(18) +#define RISCV_IOMMU_CAPABILITIES_SV57X4 BIT_ULL(19) +#define RISCV_IOMMU_CAPABILITIES_AMO_MRIF BIT_ULL(21) +#define RISCV_IOMMU_CAPABILITIES_MSI_FLAT BIT_ULL(22) +#define RISCV_IOMMU_CAPABILITIES_MSI_MRIF BIT_ULL(23) +#define RISCV_IOMMU_CAPABILITIES_AMO_HWAD BIT_ULL(24) +#define RISCV_IOMMU_CAPABILITIES_ATS BIT_ULL(25) +#define RISCV_IOMMU_CAPABILITIES_T2GPA BIT_ULL(26) +#define RISCV_IOMMU_CAPABILITIES_END BIT_ULL(27) +#define RISCV_IOMMU_CAPABILITIES_IGS GENMASK_ULL(29, 28) +#define RISCV_IOMMU_CAPABILITIES_HPM BIT_ULL(30) +#define RISCV_IOMMU_CAPABILITIES_DBG BIT_ULL(31) +#define RISCV_IOMMU_CAPABILITIES_PAS GENMASK_ULL(37, 32) +#define RISCV_IOMMU_CAPABILITIES_PD8 BIT_ULL(38) +#define RISCV_IOMMU_CAPABILITIES_PD17 BIT_ULL(39) +#define RISCV_IOMMU_CAPABILITIES_PD20 BIT_ULL(40) + +/** + * enum riscv_iommu_igs_settings - Interrupt Generation Support Settings + * @RISCV_IOMMU_CAPABILITIES_IGS_MSI: IOMMU supports only MSI generation + * @RISCV_IOMMU_CAPABILITIES_IGS_WSI: IOMMU supports only Wired-Signaled interrupt + * @RISCV_IOMMU_CAPABILITIES_IGS_BOTH: IOMMU supports both MSI and WSI generation + * @RISCV_IOMMU_CAPABILITIES_IGS_RSRV: Reserved for standard use + */ +enum riscv_iommu_igs_settings { + RISCV_IOMMU_CAPABILITIES_IGS_MSI = 0, + RISCV_IOMMU_CAPABILITIES_IGS_WSI = 1, + RISCV_IOMMU_CAPABILITIES_IGS_BOTH = 2, + RISCV_IOMMU_CAPABILITIES_IGS_RSRV = 3 +}; + +/* 5.4 Features control register (32bits) */ +#define RISCV_IOMMU_REG_FCTL 0x0008 +#define RISCV_IOMMU_FCTL_BE BIT(0) +#define RISCV_IOMMU_FCTL_WSI BIT(1) +#define RISCV_IOMMU_FCTL_GXL BIT(2) + +/* 5.5 Device-directory-table pointer (64bits) */ +#define RISCV_IOMMU_REG_DDTP 0x0010 +#define RISCV_IOMMU_DDTP_IOMMU_MODE GENMASK_ULL(3, 0) +#define RISCV_IOMMU_DDTP_BUSY BIT_ULL(4) +#define RISCV_IOMMU_DDTP_PPN RISCV_IOMMU_PPN_FIELD + +/** + * enum riscv_iommu_ddtp_modes - IOMMU translation modes + * @RISCV_IOMMU_DDTP_IOMMU_MODE_OFF: No inbound transactions allowed + * @RISCV_IOMMU_DDTP_IOMMU_MODE_BARE: Pass-through mode + * @RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL: One-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL: Two-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL: Three-level DDT + * @RISCV_IOMMU_DDTP_IOMMU_MODE_MAX: Max value allowed by specification + */ +enum riscv_iommu_ddtp_modes { + RISCV_IOMMU_DDTP_IOMMU_MODE_OFF = 0, + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE = 1, + RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL = 2, + RISCV_IOMMU_DDTP_IOMMU_MODE_2LVL = 3, + RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL = 4, + RISCV_IOMMU_DDTP_IOMMU_MODE_MAX = 4 +}; + +/* 5.6 Command Queue Base (64bits) */ +#define RISCV_IOMMU_REG_CQB 0x0018 +#define RISCV_IOMMU_CQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_CQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.7 Command Queue head (32bits) */ +#define RISCV_IOMMU_REG_CQH 0x0020 +#define RISCV_IOMMU_CQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.8 Command Queue tail (32bits) */ +#define RISCV_IOMMU_REG_CQT 0x0024 +#define RISCV_IOMMU_CQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.9 Fault Queue Base (64bits) */ +#define RISCV_IOMMU_REG_FQB 0x0028 +#define RISCV_IOMMU_FQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_FQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.10 Fault Queue Head (32bits) */ +#define RISCV_IOMMU_REG_FQH 0x0030 +#define RISCV_IOMMU_FQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.11 Fault Queue tail (32bits) */ +#define RISCV_IOMMU_REG_FQT 0x0034 +#define RISCV_IOMMU_FQT_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.12 Page Request Queue base (64bits) */ +#define RISCV_IOMMU_REG_PQB 0x0038 +#define RISCV_IOMMU_PQB_ENTRIES RISCV_IOMMU_QUEUE_LOG2SZ_FIELD +#define RISCV_IOMMU_PQB_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.13 Page Request Queue head (32bits) */ +#define RISCV_IOMMU_REG_PQH 0x0040 +#define RISCV_IOMMU_PQH_INDEX RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.14 Page Request Queue tail (32bits) */ +#define RISCV_IOMMU_REG_PQT 0x0044 +#define RISCV_IOMMU_PQT_INDEX_MASK RISCV_IOMMU_QUEUE_INDEX_FIELD + +/* 5.15 Command Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_CQCSR 0x0048 +#define RISCV_IOMMU_CQCSR_CQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_CQCSR_CIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_CQCSR_CQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_CQCSR_CMD_TO BIT(9) +#define RISCV_IOMMU_CQCSR_CMD_ILL BIT(10) +#define RISCV_IOMMU_CQCSR_FENCE_W_IP BIT(11) +#define RISCV_IOMMU_CQCSR_CQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_CQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.16 Fault Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_FQCSR 0x004C +#define RISCV_IOMMU_FQCSR_FQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_FQCSR_FIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_FQCSR_FQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_FQCSR_FQOF RISCV_IOMMU_QUEUE_OVERFLOW +#define RISCV_IOMMU_FQCSR_FQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_FQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.17 Page Request Queue CSR (32bits) */ +#define RISCV_IOMMU_REG_PQCSR 0x0050 +#define RISCV_IOMMU_PQCSR_PQEN RISCV_IOMMU_QUEUE_ENABLE +#define RISCV_IOMMU_PQCSR_PIE RISCV_IOMMU_QUEUE_INTR_ENABLE +#define RISCV_IOMMU_PQCSR_PQMF RISCV_IOMMU_QUEUE_MEM_FAULT +#define RISCV_IOMMU_PQCSR_PQOF RISCV_IOMMU_QUEUE_OVERFLOW +#define RISCV_IOMMU_PQCSR_PQON RISCV_IOMMU_QUEUE_ACTIVE +#define RISCV_IOMMU_PQCSR_BUSY RISCV_IOMMU_QUEUE_BUSY + +/* 5.18 Interrupt Pending Status (32bits) */ +#define RISCV_IOMMU_REG_IPSR 0x0054 + +#define RISCV_IOMMU_INTR_CQ 0 +#define RISCV_IOMMU_INTR_FQ 1 +#define RISCV_IOMMU_INTR_PM 2 +#define RISCV_IOMMU_INTR_PQ 3 +#define RISCV_IOMMU_INTR_COUNT 4 + +#define RISCV_IOMMU_IPSR_CIP BIT(RISCV_IOMMU_INTR_CQ) +#define RISCV_IOMMU_IPSR_FIP BIT(RISCV_IOMMU_INTR_FQ) +#define RISCV_IOMMU_IPSR_PMIP BIT(RISCV_IOMMU_INTR_PM) +#define RISCV_IOMMU_IPSR_PIP BIT(RISCV_IOMMU_INTR_PQ) + +/* 5.19 Performance monitoring counter overflow status (32bits) */ +#define RISCV_IOMMU_REG_IOCOUNTOVF 0x0058 +#define RISCV_IOMMU_IOCOUNTOVF_CY BIT(0) +#define RISCV_IOMMU_IOCOUNTOVF_HPM GENMASK_ULL(31, 1) + +/* 5.20 Performance monitoring counter inhibits (32bits) */ +#define RISCV_IOMMU_REG_IOCOUNTINH 0x005C +#define RISCV_IOMMU_IOCOUNTINH_CY BIT(0) +#define RISCV_IOMMU_IOCOUNTINH_HPM GENMASK(31, 1) + +/* 5.21 Performance monitoring cycles counter (64bits) */ +#define RISCV_IOMMU_REG_IOHPMCYCLES 0x0060 +#define RISCV_IOMMU_IOHPMCYCLES_COUNTER GENMASK_ULL(62, 0) +#define RISCV_IOMMU_IOHPMCYCLES_OF BIT_ULL(63) + +/* 5.22 Performance monitoring event counters (31 * 64bits) */ +#define RISCV_IOMMU_REG_IOHPMCTR_BASE 0x0068 +#define RISCV_IOMMU_REG_IOHPMCTR(_n) (RISCV_IOMMU_REG_IOHPMCTR_BASE + ((_n) * 0x8)) + +/* 5.23 Performance monitoring event selectors (31 * 64bits) */ +#define RISCV_IOMMU_REG_IOHPMEVT_BASE 0x0160 +#define RISCV_IOMMU_REG_IOHPMEVT(_n) (RISCV_IOMMU_REG_IOHPMEVT_BASE + ((_n) * 0x8)) +#define RISCV_IOMMU_IOHPMEVT_EVENTID GENMASK_ULL(14, 0) +#define RISCV_IOMMU_IOHPMEVT_DMASK BIT_ULL(15) +#define RISCV_IOMMU_IOHPMEVT_PID_PSCID GENMASK_ULL(35, 16) +#define RISCV_IOMMU_IOHPMEVT_DID_GSCID GENMASK_ULL(59, 36) +#define RISCV_IOMMU_IOHPMEVT_PV_PSCV BIT_ULL(60) +#define RISCV_IOMMU_IOHPMEVT_DV_GSCV BIT_ULL(61) +#define RISCV_IOMMU_IOHPMEVT_IDT BIT_ULL(62) +#define RISCV_IOMMU_IOHPMEVT_OF BIT_ULL(63) + +/* Number of defined performance-monitoring event selectors */ +#define RISCV_IOMMU_IOHPMEVT_CNT 31 + +/** + * enum riscv_iommu_hpmevent_id - Performance-monitoring event identifier + * + * @RISCV_IOMMU_HPMEVENT_INVALID: Invalid event, do not count + * @RISCV_IOMMU_HPMEVENT_URQ: Untranslated requests + * @RISCV_IOMMU_HPMEVENT_TRQ: Translated requests + * @RISCV_IOMMU_HPMEVENT_ATS_RQ: ATS translation requests + * @RISCV_IOMMU_HPMEVENT_TLB_MISS: TLB misses + * @RISCV_IOMMU_HPMEVENT_DD_WALK: Device directory walks + * @RISCV_IOMMU_HPMEVENT_PD_WALK: Process directory walks + * @RISCV_IOMMU_HPMEVENT_S_VS_WALKS: First-stage page table walks + * @RISCV_IOMMU_HPMEVENT_G_WALKS: Second-stage page table walks + * @RISCV_IOMMU_HPMEVENT_MAX: Value to denote maximum Event IDs + */ +enum riscv_iommu_hpmevent_id { + RISCV_IOMMU_HPMEVENT_INVALID = 0, + RISCV_IOMMU_HPMEVENT_URQ = 1, + RISCV_IOMMU_HPMEVENT_TRQ = 2, + RISCV_IOMMU_HPMEVENT_ATS_RQ = 3, + RISCV_IOMMU_HPMEVENT_TLB_MISS = 4, + RISCV_IOMMU_HPMEVENT_DD_WALK = 5, + RISCV_IOMMU_HPMEVENT_PD_WALK = 6, + RISCV_IOMMU_HPMEVENT_S_VS_WALKS = 7, + RISCV_IOMMU_HPMEVENT_G_WALKS = 8, + RISCV_IOMMU_HPMEVENT_MAX = 9 +}; + +/* 5.24 Translation request IOVA (64bits) */ +#define RISCV_IOMMU_REG_TR_REQ_IOVA 0x0258 +#define RISCV_IOMMU_TR_REQ_IOVA_VPN GENMASK_ULL(63, 12) + +/* 5.25 Translation request control (64bits) */ +#define RISCV_IOMMU_REG_TR_REQ_CTL 0x0260 +#define RISCV_IOMMU_TR_REQ_CTL_GO_BUSY BIT_ULL(0) +#define RISCV_IOMMU_TR_REQ_CTL_PRIV BIT_ULL(1) +#define RISCV_IOMMU_TR_REQ_CTL_EXE BIT_ULL(2) +#define RISCV_IOMMU_TR_REQ_CTL_NW BIT_ULL(3) +#define RISCV_IOMMU_TR_REQ_CTL_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_TR_REQ_CTL_PV BIT_ULL(32) +#define RISCV_IOMMU_TR_REQ_CTL_DID GENMASK_ULL(63, 40) + +/* 5.26 Translation request response (64bits) */ +#define RISCV_IOMMU_REG_TR_RESPONSE 0x0268 +#define RISCV_IOMMU_TR_RESPONSE_FAULT BIT_ULL(0) +#define RISCV_IOMMU_TR_RESPONSE_PBMT GENMASK_ULL(8, 7) +#define RISCV_IOMMU_TR_RESPONSE_SZ BIT_ULL(9) +#define RISCV_IOMMU_TR_RESPONSE_PPN RISCV_IOMMU_PPN_FIELD + +/* 5.27 Interrupt cause to vector (64bits) */ +#define RISCV_IOMMU_REG_ICVEC 0x02F8 +#define RISCV_IOMMU_ICVEC_CIV GENMASK_ULL(3, 0) +#define RISCV_IOMMU_ICVEC_FIV GENMASK_ULL(7, 4) +#define RISCV_IOMMU_ICVEC_PMIV GENMASK_ULL(11, 8) +#define RISCV_IOMMU_ICVEC_PIV GENMASK_ULL(15, 12) + +/* 5.28 MSI Configuration table (32 * 64bits) */ +#define RISCV_IOMMU_REG_MSI_CFG_TBL 0x0300 +#define RISCV_IOMMU_REG_MSI_CFG_TBL_ADDR(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10)) +#define RISCV_IOMMU_MSI_CFG_TBL_ADDR GENMASK_ULL(55, 2) +#define RISCV_IOMMU_REG_MSI_CFG_TBL_DATA(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x08) +#define RISCV_IOMMU_MSI_CFG_TBL_DATA GENMASK_ULL(31, 0) +#define RISCV_IOMMU_REG_MSI_CFG_TBL_CTRL(_n) \ + (RISCV_IOMMU_REG_MSI_CFG_TBL + ((_n) * 0x10) + 0x0C) +#define RISCV_IOMMU_MSI_CFG_TBL_CTRL_M BIT_ULL(0) + +#define RISCV_IOMMU_REG_SIZE 0x1000 + +/* + * Chapter 2: Data structures + */ + +/* + * Device Directory Table macros for non-leaf nodes + */ +#define RISCV_IOMMU_DDTE_V BIT_ULL(0) +#define RISCV_IOMMU_DDTE_PPN RISCV_IOMMU_PPN_FIELD + +/** + * struct riscv_iommu_dc - Device Context + * @tc: Translation Control + * @iohgatp: I/O Hypervisor guest address translation and protection + * (Second stage context) + * @ta: Translation Attributes + * @fsc: First stage context + * @msiptp: MSI page table pointer + * @msi_addr_mask: MSI address mask + * @msi_addr_pattern: MSI address pattern + * @_reserved: Reserved for future use, padding + * + * This structure is used for leaf nodes on the Device Directory Table, + * in case RISCV_IOMMU_CAPABILITIES_MSI_FLAT is not set, the bottom 4 fields + * are not present and are skipped with pointer arithmetic to avoid + * casting, check out riscv_iommu_get_dc(). + * See section 2.1 for more details + */ +struct riscv_iommu_dc { + u64 tc; + u64 iohgatp; + u64 ta; + u64 fsc; + u64 msiptp; + u64 msi_addr_mask; + u64 msi_addr_pattern; + u64 _reserved; +}; + +/* Translation control fields */ +#define RISCV_IOMMU_DC_TC_V BIT_ULL(0) +#define RISCV_IOMMU_DC_TC_EN_ATS BIT_ULL(1) +#define RISCV_IOMMU_DC_TC_EN_PRI BIT_ULL(2) +#define RISCV_IOMMU_DC_TC_T2GPA BIT_ULL(3) +#define RISCV_IOMMU_DC_TC_DTF BIT_ULL(4) +#define RISCV_IOMMU_DC_TC_PDTV BIT_ULL(5) +#define RISCV_IOMMU_DC_TC_PRPR BIT_ULL(6) +#define RISCV_IOMMU_DC_TC_GADE BIT_ULL(7) +#define RISCV_IOMMU_DC_TC_SADE BIT_ULL(8) +#define RISCV_IOMMU_DC_TC_DPE BIT_ULL(9) +#define RISCV_IOMMU_DC_TC_SBE BIT_ULL(10) +#define RISCV_IOMMU_DC_TC_SXL BIT_ULL(11) + +/* Second-stage (aka G-stage) context fields */ +#define RISCV_IOMMU_DC_IOHGATP_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_IOHGATP_GSCID GENMASK_ULL(59, 44) +#define RISCV_IOMMU_DC_IOHGATP_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/** + * enum riscv_iommu_dc_iohgatp_modes - Guest address translation/protection modes + * @RISCV_IOMMU_DC_IOHGATP_MODE_BARE: No translation/protection + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4: Sv32x4 (2-bit extension of Sv32), when fctl.GXL == 1 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4: Sv39x4 (2-bit extension of Sv39), when fctl.GXL == 0 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4: Sv48x4 (2-bit extension of Sv48), when fctl.GXL == 0 + * @RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4: Sv57x4 (2-bit extension of Sv57), when fctl.GXL == 0 + */ +enum riscv_iommu_dc_iohgatp_modes { + RISCV_IOMMU_DC_IOHGATP_MODE_BARE = 0, + RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4 = 8, + RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4 = 8, + RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4 = 9, + RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4 = 10 +}; + +/* Translation attributes fields */ +#define RISCV_IOMMU_DC_TA_PSCID GENMASK_ULL(31, 12) + +/* First-stage context fields */ +#define RISCV_IOMMU_DC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/** + * enum riscv_iommu_dc_fsc_atp_modes - First stage address translation/protection modes + * @RISCV_IOMMU_DC_FSC_MODE_BARE: No translation/protection + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32: Sv32, when dc.tc.SXL == 1 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: Sv39, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: Sv48, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: Sv57, when dc.tc.SXL == 0 + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8: 1lvl PDT, 8bit process ids + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17: 2lvl PDT, 17bit process ids + * @RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20: 3lvl PDT, 20bit process ids + * + * FSC holds IOSATP when RISCV_IOMMU_DC_TC_PDTV is 0 and PDTP otherwise. + * IOSATP controls the first stage address translation (same as the satp register on + * the RISC-V MMU), and PDTP holds the process directory table, used to select a + * first stage page table based on a process id (for devices that support multiple + * process ids). + */ +enum riscv_iommu_dc_fsc_atp_modes { + RISCV_IOMMU_DC_FSC_MODE_BARE = 0, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 = 8, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 = 8, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48 = 9, + RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 = 10, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8 = 1, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17 = 2, + RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20 = 3 +}; + +/* MSI page table pointer */ +#define RISCV_IOMMU_DC_MSIPTP_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_DC_MSIPTP_MODE RISCV_IOMMU_ATP_MODE_FIELD +#define RISCV_IOMMU_DC_MSIPTP_MODE_OFF 0 +#define RISCV_IOMMU_DC_MSIPTP_MODE_FLAT 1 + +/* MSI address mask */ +#define RISCV_IOMMU_DC_MSI_ADDR_MASK GENMASK_ULL(51, 0) + +/* MSI address pattern */ +#define RISCV_IOMMU_DC_MSI_PATTERN GENMASK_ULL(51, 0) + +/** + * struct riscv_iommu_pc - Process Context + * @ta: Translation Attributes + * @fsc: First stage context + * + * This structure is used for leaf nodes on the Process Directory Table + * See section 2.3 for more details + */ +struct riscv_iommu_pc { + u64 ta; + u64 fsc; +}; + +/* Translation attributes fields */ +#define RISCV_IOMMU_PC_TA_V BIT_ULL(0) +#define RISCV_IOMMU_PC_TA_ENS BIT_ULL(1) +#define RISCV_IOMMU_PC_TA_SUM BIT_ULL(2) +#define RISCV_IOMMU_PC_TA_PSCID GENMASK_ULL(31, 12) + +/* First stage context fields */ +#define RISCV_IOMMU_PC_FSC_PPN RISCV_IOMMU_ATP_PPN_FIELD +#define RISCV_IOMMU_PC_FSC_MODE RISCV_IOMMU_ATP_MODE_FIELD + +/* + * Chapter 3: In-memory queue interface + */ + +/** + * struct riscv_iommu_command - Generic IOMMU command structure + * @dword0: Includes the opcode and the function identifier + * @dword1: Opcode specific data + * + * The commands are interpreted as two 64bit fields, where the first + * 7bits of the first field are the opcode which also defines the + * command's format, followed by a 3bit field that specifies the + * function invoked by that command, and the rest is opcode-specific. + * This is a generic struct which will be populated differently + * according to each command. For more infos on the commands and + * the command queue check section 3.1. + */ +struct riscv_iommu_command { + u64 dword0; + u64 dword1; +}; + +/* Fields on dword0, common for all commands */ +#define RISCV_IOMMU_CMD_OPCODE GENMASK_ULL(6, 0) +#define RISCV_IOMMU_CMD_FUNC GENMASK_ULL(9, 7) + +/* 3.1.1 IOMMU Page-table cache invalidation */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IOTINVAL_OPCODE 1 +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA 0 +#define RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA 1 +#define RISCV_IOMMU_CMD_IOTINVAL_AV BIT_ULL(10) +#define RISCV_IOMMU_CMD_IOTINVAL_PSCID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_IOTINVAL_PSCV BIT_ULL(32) +#define RISCV_IOMMU_CMD_IOTINVAL_GV BIT_ULL(33) +#define RISCV_IOMMU_CMD_IOTINVAL_GSCID GENMASK_ULL(59, 44) +/* dword1[61:10] is the 4K-aligned page address */ +#define RISCV_IOMMU_CMD_IOTINVAL_ADDR GENMASK_ULL(61, 10) + +/* 3.1.2 IOMMU Command Queue Fences */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IOFENCE_OPCODE 2 +#define RISCV_IOMMU_CMD_IOFENCE_FUNC_C 0 +#define RISCV_IOMMU_CMD_IOFENCE_AV BIT_ULL(10) +#define RISCV_IOMMU_CMD_IOFENCE_WSI BIT_ULL(11) +#define RISCV_IOMMU_CMD_IOFENCE_PR BIT_ULL(12) +#define RISCV_IOMMU_CMD_IOFENCE_PW BIT_ULL(13) +#define RISCV_IOMMU_CMD_IOFENCE_DATA GENMASK_ULL(63, 32) +/* dword1 is the address, word-size aligned and shifted to the right by two bits. */ + +/* 3.1.3 IOMMU Directory cache invalidation */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_IODIR_OPCODE 3 +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT 0 +#define RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT 1 +#define RISCV_IOMMU_CMD_IODIR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_IODIR_DV BIT_ULL(33) +#define RISCV_IOMMU_CMD_IODIR_DID GENMASK_ULL(63, 40) +/* dword1 is reserved for standard use */ + +/* 3.1.4 IOMMU PCIe ATS */ +/* Fields on dword0 */ +#define RISCV_IOMMU_CMD_ATS_OPCODE 4 +#define RISCV_IOMMU_CMD_ATS_FUNC_INVAL 0 +#define RISCV_IOMMU_CMD_ATS_FUNC_PRGR 1 +#define RISCV_IOMMU_CMD_ATS_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_CMD_ATS_PV BIT_ULL(32) +#define RISCV_IOMMU_CMD_ATS_DSV BIT_ULL(33) +#define RISCV_IOMMU_CMD_ATS_RID GENMASK_ULL(55, 40) +#define RISCV_IOMMU_CMD_ATS_DSEG GENMASK_ULL(63, 56) +/* dword1 is the ATS payload, two different payload types for INVAL and PRGR */ + +/* ATS.INVAL payload*/ +#define RISCV_IOMMU_CMD_ATS_INVAL_G BIT_ULL(0) +/* Bits 1 - 10 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_INVAL_S BIT_ULL(11) +#define RISCV_IOMMU_CMD_ATS_INVAL_UADDR GENMASK_ULL(63, 12) + +/* ATS.PRGR payload */ +/* Bits 0 - 31 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_PRGR_PRG_INDEX GENMASK_ULL(40, 32) +/* Bits 41 - 43 are zeroed */ +#define RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE GENMASK_ULL(47, 44) +#define RISCV_IOMMU_CMD_ATS_PRGR_DST_ID GENMASK_ULL(63, 48) + +/** + * struct riscv_iommu_fq_record - Fault/Event Queue Record + * @hdr: Header, includes fault/event cause, PID/DID, transaction type etc + * @_reserved: Low 32bits for custom use, high 32bits for standard use + * @iotval: Transaction-type/cause specific format + * @iotval2: Cause specific format + * + * The fault/event queue reports events and failures raised when + * processing transactions. Each record is a 32byte structure where + * the first dword has a fixed format for providing generic infos + * regarding the fault/event, and two more dwords are there for + * fault/event-specific information. For more details see section + * 3.2. + */ +struct riscv_iommu_fq_record { + u64 hdr; + u64 _reserved; + u64 iotval; + u64 iotval2; +}; + +/* Fields on header */ +#define RISCV_IOMMU_FQ_HDR_CAUSE GENMASK_ULL(11, 0) +#define RISCV_IOMMU_FQ_HDR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_FQ_HDR_PV BIT_ULL(32) +#define RISCV_IOMMU_FQ_HDR_PRIV BIT_ULL(33) +#define RISCV_IOMMU_FQ_HDR_TTYP GENMASK_ULL(39, 34) +#define RISCV_IOMMU_FQ_HDR_DID GENMASK_ULL(63, 40) + +/** + * enum riscv_iommu_fq_causes - Fault/event cause values + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT: Instruction access fault + * @RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED: Read address misaligned + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT: Read load fault + * @RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED: Write/AMO address misaligned + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT: Write/AMO access fault + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S: Instruction page fault + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S: Read page fault + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S: Write/AMO page fault + * @RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS: Instruction guest page fault + * @RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS: Read guest page fault + * @RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS: Write/AMO guest page fault + * @RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED: All inbound transactions disallowed + * @RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT: DDT entry load access fault + * @RISCV_IOMMU_FQ_CAUSE_DDT_INVALID: DDT entry invalid + * @RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED: DDT entry misconfigured + * @RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED: Transaction type disallowed + * @RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT: MSI PTE load access fault + * @RISCV_IOMMU_FQ_CAUSE_MSI_INVALID: MSI PTE invalid + * @RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED: MSI PTE misconfigured + * @RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT: MRIF access fault + * @RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT: PDT entry load access fault + * @RISCV_IOMMU_FQ_CAUSE_PDT_INVALID: PDT entry invalid + * @RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED: PDT entry misconfigured + * @RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED: DDT data corruption + * @RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED: PDT data corruption + * @RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED: MSI page table data corruption + * @RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED: MRIF data corruption + * @RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR: Internal data path error + * @RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT: IOMMU MSI write access fault + * @RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED: First/second stage page table data corruption + * + * Values are on table 11 of the spec, encodings 275 - 2047 are reserved for standard + * use, and 2048 - 4095 for custom use. + */ +enum riscv_iommu_fq_causes { + RISCV_IOMMU_FQ_CAUSE_INST_FAULT = 1, + RISCV_IOMMU_FQ_CAUSE_RD_ADDR_MISALIGNED = 4, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT = 5, + RISCV_IOMMU_FQ_CAUSE_WR_ADDR_MISALIGNED = 6, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT = 7, + RISCV_IOMMU_FQ_CAUSE_INST_FAULT_S = 12, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S = 13, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S = 15, + RISCV_IOMMU_FQ_CAUSE_INST_FAULT_VS = 20, + RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS = 21, + RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS = 23, + RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED = 256, + RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT = 257, + RISCV_IOMMU_FQ_CAUSE_DDT_INVALID = 258, + RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED = 259, + RISCV_IOMMU_FQ_CAUSE_TTYP_BLOCKED = 260, + RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT = 261, + RISCV_IOMMU_FQ_CAUSE_MSI_INVALID = 262, + RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED = 263, + RISCV_IOMMU_FQ_CAUSE_MRIF_FAULT = 264, + RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT = 265, + RISCV_IOMMU_FQ_CAUSE_PDT_INVALID = 266, + RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED = 267, + RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED = 268, + RISCV_IOMMU_FQ_CAUSE_PDT_CORRUPTED = 269, + RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED = 270, + RISCV_IOMMU_FQ_CAUSE_MRIF_CORRUIPTED = 271, + RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR = 272, + RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT = 273, + RISCV_IOMMU_FQ_CAUSE_PT_CORRUPTED = 274 +}; + +/** + * enum riscv_iommu_fq_ttypes: Fault/event transaction types + * @RISCV_IOMMU_FQ_TTYP_NONE: None. Fault not caused by an inbound transaction. + * @RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH: Instruction fetch from untranslated address + * @RISCV_IOMMU_FQ_TTYP_UADDR_RD: Read from untranslated address + * @RISCV_IOMMU_FQ_TTYP_UADDR_WR: Write/AMO to untranslated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH: Instruction fetch from translated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_RD: Read from translated address + * @RISCV_IOMMU_FQ_TTYP_TADDR_WR: Write/AMO to translated address + * @RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ: PCIe ATS translation request + * @RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ: PCIe message request + * + * Values are on table 12 of the spec, type 4 and 10 - 31 are reserved for standard use + * and 31 - 63 for custom use. + */ +enum riscv_iommu_fq_ttypes { + RISCV_IOMMU_FQ_TTYP_NONE = 0, + RISCV_IOMMU_FQ_TTYP_UADDR_INST_FETCH = 1, + RISCV_IOMMU_FQ_TTYP_UADDR_RD = 2, + RISCV_IOMMU_FQ_TTYP_UADDR_WR = 3, + RISCV_IOMMU_FQ_TTYP_TADDR_INST_FETCH = 5, + RISCV_IOMMU_FQ_TTYP_TADDR_RD = 6, + RISCV_IOMMU_FQ_TTYP_TADDR_WR = 7, + RISCV_IOMMU_FQ_TTYP_PCIE_ATS_REQ = 8, + RISCV_IOMMU_FQ_TTYP_PCIE_MSG_REQ = 9, +}; + +/** + * struct riscv_iommu_pq_record - PCIe Page Request record + * @hdr: Header, includes PID, DID etc + * @payload: Holds the page address, request group and permission bits + * + * For more infos on the PCIe Page Request queue see chapter 3.3. + */ +struct riscv_iommu_pq_record { + u64 hdr; + u64 payload; +}; + +/* Header fields */ +#define RISCV_IOMMU_PQ_HDR_PID GENMASK_ULL(31, 12) +#define RISCV_IOMMU_PQ_HDR_PV BIT_ULL(32) +#define RISCV_IOMMU_PQ_HDR_PRIV BIT_ULL(33) +#define RISCV_IOMMU_PQ_HDR_EXEC BIT_ULL(34) +#define RISCV_IOMMU_PQ_HDR_DID GENMASK_ULL(63, 40) + +/* Payload fields */ +#define RISCV_IOMMU_PQ_PAYLOAD_R BIT_ULL(0) +#define RISCV_IOMMU_PQ_PAYLOAD_W BIT_ULL(1) +#define RISCV_IOMMU_PQ_PAYLOAD_L BIT_ULL(2) +#define RISCV_IOMMU_PQ_PAYLOAD_RWL_MASK GENMASK_ULL(2, 0) +#define RISCV_IOMMU_PQ_PAYLOAD_PRGI GENMASK_ULL(11, 3) /* Page Request Group Index */ +#define RISCV_IOMMU_PQ_PAYLOAD_ADDR GENMASK_ULL(63, 12) + +/** + * struct riscv_iommu_msipte - MSI Page Table Entry + * @pte: MSI PTE + * @mrif_info: Memory-resident interrupt file info + * + * The MSI Page Table is used for virtualizing MSIs, so that when + * a device sends an MSI to a guest, the IOMMU can reroute it + * by translating the MSI address, either to a guest interrupt file + * or a memory resident interrupt file (MRIF). Note that this page table + * is an array of MSI PTEs, not a multi-level pt, each entry + * is a leaf entry. For more infos check out the AIA spec, chapter 9.5. + * + * Also in basic mode the mrif_info field is ignored by the IOMMU and can + * be used by software, any other reserved fields on pte must be zeroed-out + * by software. + */ +struct riscv_iommu_msipte { + u64 pte; + u64 mrif_info; +}; + +/* Fields on pte */ +#define RISCV_IOMMU_MSIPTE_V BIT_ULL(0) +#define RISCV_IOMMU_MSIPTE_M GENMASK_ULL(2, 1) +#define RISCV_IOMMU_MSIPTE_MRIF_ADDR GENMASK_ULL(53, 7) /* When M == 1 (MRIF mode) */ +#define RISCV_IOMMU_MSIPTE_PPN RISCV_IOMMU_PPN_FIELD /* When M == 3 (basic mode) */ +#define RISCV_IOMMU_MSIPTE_C BIT_ULL(63) + +/* Fields on mrif_info */ +#define RISCV_IOMMU_MSIPTE_MRIF_NID GENMASK_ULL(9, 0) +#define RISCV_IOMMU_MSIPTE_MRIF_NPPN RISCV_IOMMU_PPN_FIELD +#define RISCV_IOMMU_MSIPTE_MRIF_NID_MSB BIT_ULL(60) + +/* Helper functions: command structure builders. */ + +static inline void riscv_iommu_cmd_inval_vma(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOTINVAL_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, + u64 addr) +{ + cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr)); + cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV; +} + +static inline void riscv_iommu_cmd_inval_set_pscid(struct riscv_iommu_command *cmd, + int pscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_PSCID, pscid) | + RISCV_IOMMU_CMD_IOTINVAL_PSCV; +} + +static inline void riscv_iommu_cmd_inval_set_gscid(struct riscv_iommu_command *cmd, + int gscid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_GSCID, gscid) | + RISCV_IOMMU_CMD_IOTINVAL_GV; +} + +static inline void riscv_iommu_cmd_iofence(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + RISCV_IOMMU_CMD_IOFENCE_PR | RISCV_IOMMU_CMD_IOFENCE_PW; + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iofence_set_av(struct riscv_iommu_command *cmd, + u64 addr, u32 data) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IOFENCE_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IOFENCE_FUNC_C) | + FIELD_PREP(RISCV_IOMMU_CMD_IOFENCE_DATA, data) | + RISCV_IOMMU_CMD_IOFENCE_AV; + cmd->dword1 = addr >> 2; +} + +static inline void riscv_iommu_cmd_iodir_inval_ddt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_inval_pdt(struct riscv_iommu_command *cmd) +{ + cmd->dword0 = FIELD_PREP(RISCV_IOMMU_CMD_OPCODE, RISCV_IOMMU_CMD_IODIR_OPCODE) | + FIELD_PREP(RISCV_IOMMU_CMD_FUNC, RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT); + cmd->dword1 = 0; +} + +static inline void riscv_iommu_cmd_iodir_set_did(struct riscv_iommu_command *cmd, + unsigned int devid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_DID, devid) | + RISCV_IOMMU_CMD_IODIR_DV; +} + +static inline void riscv_iommu_cmd_iodir_set_pid(struct riscv_iommu_command *cmd, + unsigned int pasid) +{ + cmd->dword0 |= FIELD_PREP(RISCV_IOMMU_CMD_IODIR_PID, pasid); +} + +#endif /* _RISCV_IOMMU_BITS_H_ */ diff --git a/drivers/iommu/riscv/iommu-pci.c b/drivers/iommu/riscv/iommu-pci.c new file mode 100644 index 000000000000..c7a89143014c --- /dev/null +++ b/drivers/iommu/riscv/iommu-pci.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * + * RISCV IOMMU as a PCIe device + * + * Authors + * Tomasz Jeznach <[email protected]> + * Nick Kossifidis <[email protected]> + */ + +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/iommu.h> +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "iommu-bits.h" +#include "iommu.h" + +/* QEMU RISC-V IOMMU implementation */ +#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014 + +/* Rivos Inc. assigned PCI Vendor and Device IDs */ +#ifndef PCI_VENDOR_ID_RIVOS +#define PCI_VENDOR_ID_RIVOS 0x1efd +#endif + +#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008 + +static int riscv_iommu_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct device *dev = &pdev->dev; + struct riscv_iommu_device *iommu; + int rc, vec; + + rc = pcim_enable_device(pdev); + if (rc) + return rc; + + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) + return -ENODEV; + + if (pci_resource_len(pdev, 0) < RISCV_IOMMU_REG_SIZE) + return -ENODEV; + + rc = pcim_iomap_regions(pdev, BIT(0), pci_name(pdev)); + if (rc) + return dev_err_probe(dev, rc, "pcim_iomap_regions failed\n"); + + iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + + iommu->dev = dev; + iommu->reg = pcim_iomap_table(pdev)[0]; + + pci_set_master(pdev); + dev_set_drvdata(dev, iommu); + + /* Check device reported capabilities / features. */ + iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + + /* The PCI driver only uses MSIs, make sure the IOMMU supports this */ + switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) { + case RISCV_IOMMU_CAPABILITIES_IGS_MSI: + case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: + break; + default: + return dev_err_probe(dev, -ENODEV, + "unable to use message-signaled interrupts\n"); + } + + /* Allocate and assign IRQ vectors for the various events */ + rc = pci_alloc_irq_vectors(pdev, 1, RISCV_IOMMU_INTR_COUNT, + PCI_IRQ_MSIX | PCI_IRQ_MSI); + if (rc <= 0) + return dev_err_probe(dev, -ENODEV, + "unable to allocate irq vectors\n"); + + iommu->irqs_count = rc; + for (vec = 0; vec < iommu->irqs_count; vec++) + iommu->irqs[vec] = msi_get_virq(dev, vec); + + /* Enable message-signaled interrupts, fctl.WSI */ + if (iommu->fctl & RISCV_IOMMU_FCTL_WSI) { + iommu->fctl ^= RISCV_IOMMU_FCTL_WSI; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl); + } + + return riscv_iommu_init(iommu); +} + +static void riscv_iommu_pci_remove(struct pci_dev *pdev) +{ + struct riscv_iommu_device *iommu = dev_get_drvdata(&pdev->dev); + + riscv_iommu_remove(iommu); +} + +static const struct pci_device_id riscv_iommu_pci_tbl[] = { + {PCI_VDEVICE(REDHAT, PCI_DEVICE_ID_REDHAT_RISCV_IOMMU), 0}, + {PCI_VDEVICE(RIVOS, PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA), 0}, + {0,} +}; + +static struct pci_driver riscv_iommu_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = riscv_iommu_pci_tbl, + .probe = riscv_iommu_pci_probe, + .remove = riscv_iommu_pci_remove, + .driver = { + .suppress_bind_attrs = true, + }, +}; + +builtin_pci_driver(riscv_iommu_pci_driver); diff --git a/drivers/iommu/riscv/iommu-platform.c b/drivers/iommu/riscv/iommu-platform.c new file mode 100644 index 000000000000..da336863f152 --- /dev/null +++ b/drivers/iommu/riscv/iommu-platform.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RISC-V IOMMU as a platform device + * + * Copyright © 2023 FORTH-ICS/CARV + * Copyright © 2023-2024 Rivos Inc. + * + * Authors + * Nick Kossifidis <[email protected]> + * Tomasz Jeznach <[email protected]> + */ + +#include <linux/kernel.h> +#include <linux/of_platform.h> +#include <linux/platform_device.h> + +#include "iommu-bits.h" +#include "iommu.h" + +static int riscv_iommu_platform_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct riscv_iommu_device *iommu = NULL; + struct resource *res = NULL; + int vec; + + iommu = devm_kzalloc(dev, sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return -ENOMEM; + + iommu->dev = dev; + iommu->reg = devm_platform_get_and_ioremap_resource(pdev, 0, &res); + if (IS_ERR(iommu->reg)) + return dev_err_probe(dev, PTR_ERR(iommu->reg), + "could not map register region\n"); + + dev_set_drvdata(dev, iommu); + + /* Check device reported capabilities / features. */ + iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + + /* For now we only support WSI */ + switch (FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps)) { + case RISCV_IOMMU_CAPABILITIES_IGS_WSI: + case RISCV_IOMMU_CAPABILITIES_IGS_BOTH: + break; + default: + return dev_err_probe(dev, -ENODEV, + "unable to use wire-signaled interrupts\n"); + } + + iommu->irqs_count = platform_irq_count(pdev); + if (iommu->irqs_count <= 0) + return dev_err_probe(dev, -ENODEV, + "no IRQ resources provided\n"); + if (iommu->irqs_count > RISCV_IOMMU_INTR_COUNT) + iommu->irqs_count = RISCV_IOMMU_INTR_COUNT; + + for (vec = 0; vec < iommu->irqs_count; vec++) + iommu->irqs[vec] = platform_get_irq(pdev, vec); + + /* Enable wire-signaled interrupts, fctl.WSI */ + if (!(iommu->fctl & RISCV_IOMMU_FCTL_WSI)) { + iommu->fctl |= RISCV_IOMMU_FCTL_WSI; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, iommu->fctl); + } + + return riscv_iommu_init(iommu); +}; + +static void riscv_iommu_platform_remove(struct platform_device *pdev) +{ + riscv_iommu_remove(dev_get_drvdata(&pdev->dev)); +}; + +static const struct of_device_id riscv_iommu_of_match[] = { + {.compatible = "riscv,iommu",}, + {}, +}; + +static struct platform_driver riscv_iommu_platform_driver = { + .probe = riscv_iommu_platform_probe, + .remove_new = riscv_iommu_platform_remove, + .driver = { + .name = "riscv,iommu", + .of_match_table = riscv_iommu_of_match, + .suppress_bind_attrs = true, + }, +}; + +builtin_platform_driver(riscv_iommu_platform_driver); diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c new file mode 100644 index 000000000000..8a05def774bd --- /dev/null +++ b/drivers/iommu/riscv/iommu.c @@ -0,0 +1,1661 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * IOMMU API for RISC-V IOMMU implementations. + * + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * + * Authors + * Tomasz Jeznach <[email protected]> + * Nick Kossifidis <[email protected]> + */ + +#define pr_fmt(fmt) "riscv-iommu: " fmt + +#include <linux/compiler.h> +#include <linux/crash_dump.h> +#include <linux/init.h> +#include <linux/iommu.h> +#include <linux/iopoll.h> +#include <linux/kernel.h> +#include <linux/pci.h> + +#include "../iommu-pages.h" +#include "iommu-bits.h" +#include "iommu.h" + +/* Timeouts in [us] */ +#define RISCV_IOMMU_QCSR_TIMEOUT 150000 +#define RISCV_IOMMU_QUEUE_TIMEOUT 150000 +#define RISCV_IOMMU_DDTP_TIMEOUT 10000000 +#define RISCV_IOMMU_IOTINVAL_TIMEOUT 90000000 + +/* Number of entries per CMD/FLT queue, should be <= INT_MAX */ +#define RISCV_IOMMU_DEF_CQ_COUNT 8192 +#define RISCV_IOMMU_DEF_FQ_COUNT 4096 + +/* RISC-V IOMMU PPN <> PHYS address conversions, PHYS <=> PPN[53:10] */ +#define phys_to_ppn(pa) (((pa) >> 2) & (((1ULL << 44) - 1) << 10)) +#define ppn_to_phys(pn) (((pn) << 2) & (((1ULL << 44) - 1) << 12)) + +#define dev_to_iommu(dev) \ + iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu) + +/* IOMMU PSCID allocation namespace. */ +static DEFINE_IDA(riscv_iommu_pscids); +#define RISCV_IOMMU_MAX_PSCID (BIT(20) - 1) + +/* Device resource-managed allocations */ +struct riscv_iommu_devres { + void *addr; + int order; +}; + +static void riscv_iommu_devres_pages_release(struct device *dev, void *res) +{ + struct riscv_iommu_devres *devres = res; + + iommu_free_pages(devres->addr, devres->order); +} + +static int riscv_iommu_devres_pages_match(struct device *dev, void *res, void *p) +{ + struct riscv_iommu_devres *devres = res; + struct riscv_iommu_devres *target = p; + + return devres->addr == target->addr; +} + +static void *riscv_iommu_get_pages(struct riscv_iommu_device *iommu, int order) +{ + struct riscv_iommu_devres *devres; + void *addr; + + addr = iommu_alloc_pages_node(dev_to_node(iommu->dev), + GFP_KERNEL_ACCOUNT, order); + if (unlikely(!addr)) + return NULL; + + devres = devres_alloc(riscv_iommu_devres_pages_release, + sizeof(struct riscv_iommu_devres), GFP_KERNEL); + + if (unlikely(!devres)) { + iommu_free_pages(addr, order); + return NULL; + } + + devres->addr = addr; + devres->order = order; + + devres_add(iommu->dev, devres); + + return addr; +} + +static void riscv_iommu_free_pages(struct riscv_iommu_device *iommu, void *addr) +{ + struct riscv_iommu_devres devres = { .addr = addr }; + + devres_release(iommu->dev, riscv_iommu_devres_pages_release, + riscv_iommu_devres_pages_match, &devres); +} + +/* + * Hardware queue allocation and management. + */ + +/* Setup queue base, control registers and default queue length */ +#define RISCV_IOMMU_QUEUE_INIT(q, name) do { \ + struct riscv_iommu_queue *_q = q; \ + _q->qid = RISCV_IOMMU_INTR_ ## name; \ + _q->qbr = RISCV_IOMMU_REG_ ## name ## B; \ + _q->qcr = RISCV_IOMMU_REG_ ## name ## CSR; \ + _q->mask = _q->mask ?: (RISCV_IOMMU_DEF_ ## name ## _COUNT) - 1;\ +} while (0) + +/* Note: offsets are the same for all queues */ +#define Q_HEAD(q) ((q)->qbr + (RISCV_IOMMU_REG_CQH - RISCV_IOMMU_REG_CQB)) +#define Q_TAIL(q) ((q)->qbr + (RISCV_IOMMU_REG_CQT - RISCV_IOMMU_REG_CQB)) +#define Q_ITEM(q, index) ((q)->mask & (index)) +#define Q_IPSR(q) BIT((q)->qid) + +/* + * Discover queue ring buffer hardware configuration, allocate in-memory + * ring buffer or use fixed I/O memory location, configure queue base register. + * Must be called before hardware queue is enabled. + * + * @queue - data structure, configured with RISCV_IOMMU_QUEUE_INIT() + * @entry_size - queue single element size in bytes. + */ +static int riscv_iommu_queue_alloc(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + size_t entry_size) +{ + unsigned int logsz; + u64 qb, rb; + + /* + * Use WARL base register property to discover maximum allowed + * number of entries and optional fixed IO address for queue location. + */ + riscv_iommu_writeq(iommu, queue->qbr, RISCV_IOMMU_QUEUE_LOG2SZ_FIELD); + qb = riscv_iommu_readq(iommu, queue->qbr); + + /* + * Calculate and verify hardware supported queue length, as reported + * by the field LOG2SZ, where max queue length is equal to 2^(LOG2SZ + 1). + * Update queue size based on hardware supported value. + */ + logsz = ilog2(queue->mask); + if (logsz > FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb)) + logsz = FIELD_GET(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, qb); + + /* + * Use WARL base register property to discover an optional fixed IO + * address for queue ring buffer location. Otherwise allocate contiguous + * system memory. + */ + if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { + const size_t queue_size = entry_size << (logsz + 1); + + queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); + queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); + } else { + do { + const size_t queue_size = entry_size << (logsz + 1); + const int order = get_order(queue_size); + + queue->base = riscv_iommu_get_pages(iommu, order); + queue->phys = __pa(queue->base); + } while (!queue->base && logsz-- > 0); + } + + if (!queue->base) + return -ENOMEM; + + qb = phys_to_ppn(queue->phys) | + FIELD_PREP(RISCV_IOMMU_QUEUE_LOG2SZ_FIELD, logsz); + + /* Update base register and read back to verify hw accepted our write */ + riscv_iommu_writeq(iommu, queue->qbr, qb); + rb = riscv_iommu_readq(iommu, queue->qbr); + if (rb != qb) { + dev_err(iommu->dev, "queue #%u allocation failed\n", queue->qid); + return -ENODEV; + } + + /* Update actual queue mask */ + queue->mask = (2U << logsz) - 1; + + dev_dbg(iommu->dev, "queue #%u allocated 2^%u entries", + queue->qid, logsz + 1); + + return 0; +} + +/* Check interrupt queue status, IPSR */ +static irqreturn_t riscv_iommu_queue_ipsr(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + + if (riscv_iommu_readl(queue->iommu, RISCV_IOMMU_REG_IPSR) & Q_IPSR(queue)) + return IRQ_WAKE_THREAD; + + return IRQ_NONE; +} + +static int riscv_iommu_queue_vec(struct riscv_iommu_device *iommu, int n) +{ + /* Reuse ICVEC.CIV mask for all interrupt vectors mapping. */ + return (iommu->icvec >> (n * 4)) & RISCV_IOMMU_ICVEC_CIV; +} + +/* + * Enable queue processing in the hardware, register interrupt handler. + * + * @queue - data structure, already allocated with riscv_iommu_queue_alloc() + * @irq_handler - threaded interrupt handler. + */ +static int riscv_iommu_queue_enable(struct riscv_iommu_device *iommu, + struct riscv_iommu_queue *queue, + irq_handler_t irq_handler) +{ + const unsigned int irq = iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)]; + u32 csr; + int rc; + + if (queue->iommu) + return -EBUSY; + + /* Polling not implemented */ + if (!irq) + return -ENODEV; + + queue->iommu = iommu; + rc = request_threaded_irq(irq, riscv_iommu_queue_ipsr, irq_handler, + IRQF_ONESHOT | IRQF_SHARED, + dev_name(iommu->dev), queue); + if (rc) { + queue->iommu = NULL; + return rc; + } + + /* + * Enable queue with interrupts, clear any memory fault if any. + * Wait for the hardware to acknowledge request and activate queue + * processing. + * Note: All CSR bitfields are in the same offsets for all queues. + */ + riscv_iommu_writel(iommu, queue->qcr, + RISCV_IOMMU_QUEUE_ENABLE | + RISCV_IOMMU_QUEUE_INTR_ENABLE | + RISCV_IOMMU_QUEUE_MEM_FAULT); + + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (RISCV_IOMMU_QUEUE_ACTIVE != (csr & (RISCV_IOMMU_QUEUE_ACTIVE | + RISCV_IOMMU_QUEUE_BUSY | + RISCV_IOMMU_QUEUE_MEM_FAULT))) { + /* Best effort to stop and disable failing hardware queue. */ + riscv_iommu_writel(iommu, queue->qcr, 0); + free_irq(irq, queue); + queue->iommu = NULL; + dev_err(iommu->dev, "queue #%u failed to start\n", queue->qid); + return -EBUSY; + } + + /* Clear any pending interrupt flag. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return 0; +} + +/* + * Disable queue. Wait for the hardware to acknowledge request and + * stop processing enqueued requests. Report errors but continue. + */ +static void riscv_iommu_queue_disable(struct riscv_iommu_queue *queue) +{ + struct riscv_iommu_device *iommu = queue->iommu; + u32 csr; + + if (!iommu) + return; + + free_irq(iommu->irqs[riscv_iommu_queue_vec(iommu, queue->qid)], queue); + riscv_iommu_writel(iommu, queue->qcr, 0); + riscv_iommu_readl_timeout(iommu, queue->qcr, + csr, !(csr & RISCV_IOMMU_QUEUE_BUSY), + 10, RISCV_IOMMU_QCSR_TIMEOUT); + + if (csr & (RISCV_IOMMU_QUEUE_ACTIVE | RISCV_IOMMU_QUEUE_BUSY)) + dev_err(iommu->dev, "fail to disable hardware queue #%u, csr 0x%x\n", + queue->qid, csr); + + queue->iommu = NULL; +} + +/* + * Returns number of available valid queue entries and the first item index. + * Update shadow producer index if necessary. + */ +static int riscv_iommu_queue_consume(struct riscv_iommu_queue *queue, + unsigned int *index) +{ + unsigned int head = atomic_read(&queue->head); + unsigned int tail = atomic_read(&queue->tail); + unsigned int last = Q_ITEM(queue, tail); + int available = (int)(tail - head); + + *index = head; + + if (available > 0) + return available; + + /* read hardware producer index, check reserved register bits are not set. */ + if (riscv_iommu_readl_timeout(queue->iommu, Q_TAIL(queue), + tail, (tail & ~queue->mask) == 0, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) { + dev_err_once(queue->iommu->dev, + "Hardware error: queue access timeout\n"); + return 0; + } + + if (tail == last) + return 0; + + /* update shadow producer index */ + return (int)(atomic_add_return((tail - last) & queue->mask, &queue->tail) - head); +} + +/* + * Release processed queue entries, should match riscv_iommu_queue_consume() calls. + */ +static void riscv_iommu_queue_release(struct riscv_iommu_queue *queue, int count) +{ + const unsigned int head = atomic_add_return(count, &queue->head); + + riscv_iommu_writel(queue->iommu, Q_HEAD(queue), Q_ITEM(queue, head)); +} + +/* Return actual consumer index based on hardware reported queue head index. */ +static unsigned int riscv_iommu_queue_cons(struct riscv_iommu_queue *queue) +{ + const unsigned int cons = atomic_read(&queue->head); + const unsigned int last = Q_ITEM(queue, cons); + unsigned int head; + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask), + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + return cons; + + return cons + ((head - last) & queue->mask); +} + +/* Wait for submitted item to be processed. */ +static int riscv_iommu_queue_wait(struct riscv_iommu_queue *queue, + unsigned int index, + unsigned int timeout_us) +{ + unsigned int cons = atomic_read(&queue->head); + + /* Already processed by the consumer */ + if ((int)(cons - index) > 0) + return 0; + + /* Monitor consumer index */ + return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, + (int)(cons - index) > 0, 0, timeout_us); +} + +/* Enqueue an entry and wait to be processed if timeout_us > 0 + * + * Error handling for IOMMU hardware not responding in reasonable time + * will be added as separate patch series along with other RAS features. + * For now, only report hardware failure and continue. + */ +static unsigned int riscv_iommu_queue_send(struct riscv_iommu_queue *queue, + void *entry, size_t entry_size) +{ + unsigned int prod; + unsigned int head; + unsigned int tail; + unsigned long flags; + + /* Do not preempt submission flow. */ + local_irq_save(flags); + + /* 1. Allocate some space in the queue */ + prod = atomic_inc_return(&queue->prod) - 1; + head = atomic_read(&queue->head); + + /* 2. Wait for space availability. */ + if ((prod - head) > queue->mask) { + if (readx_poll_timeout(atomic_read, &queue->head, + head, (prod - head) < queue->mask, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + } else if ((prod - head) == queue->mask) { + const unsigned int last = Q_ITEM(queue, head); + + if (riscv_iommu_readl_timeout(queue->iommu, Q_HEAD(queue), head, + !(head & ~queue->mask) && head != last, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + atomic_add((head - last) & queue->mask, &queue->head); + } + + /* 3. Store entry in the ring buffer */ + memcpy(queue->base + Q_ITEM(queue, prod) * entry_size, entry, entry_size); + + /* 4. Wait for all previous entries to be ready */ + if (readx_poll_timeout(atomic_read, &queue->tail, tail, prod == tail, + 0, RISCV_IOMMU_QUEUE_TIMEOUT)) + goto err_busy; + + /* + * 5. Make sure the ring buffer update (whether in normal or I/O memory) is + * completed and visible before signaling the tail doorbell to fetch + * the next command. 'fence ow, ow' + */ + dma_wmb(); + riscv_iommu_writel(queue->iommu, Q_TAIL(queue), Q_ITEM(queue, prod + 1)); + + /* + * 6. Make sure the doorbell write to the device has finished before updating + * the shadow tail index in normal memory. 'fence o, w' + */ + mmiowb(); + atomic_inc(&queue->tail); + + /* 7. Complete submission and restore local interrupts */ + local_irq_restore(flags); + + return prod; + +err_busy: + local_irq_restore(flags); + dev_err_once(queue->iommu->dev, "Hardware error: command enqueue failed\n"); + + return prod; +} + +/* + * IOMMU Command queue chapter 3.1 + */ + +/* Command queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_cmdq_process(int irq, void *data) +{ + const struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + unsigned int ctrl; + + /* Clear MF/CQ errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(queue->iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | + RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_FENCE_W_IP)) { + riscv_iommu_writel(queue->iommu, queue->qcr, ctrl); + dev_warn(queue->iommu->dev, + "Queue #%u error; fault:%d timeout:%d illegal:%d fence_w_ip:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_CQCSR_CQMF), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_TO), + !!(ctrl & RISCV_IOMMU_CQCSR_CMD_ILL), + !!(ctrl & RISCV_IOMMU_CQCSR_FENCE_W_IP)); + } + + /* Placeholder for command queue interrupt notifiers */ + + /* Clear command interrupt pending. */ + riscv_iommu_writel(queue->iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + return IRQ_HANDLED; +} + +/* Send command to the IOMMU command queue */ +static void riscv_iommu_cmd_send(struct riscv_iommu_device *iommu, + struct riscv_iommu_command *cmd) +{ + riscv_iommu_queue_send(&iommu->cmdq, cmd, sizeof(*cmd)); +} + +/* Send IOFENCE.C command and wait for all scheduled commands to complete. */ +static void riscv_iommu_cmd_sync(struct riscv_iommu_device *iommu, + unsigned int timeout_us) +{ + struct riscv_iommu_command cmd; + unsigned int prod; + + riscv_iommu_cmd_iofence(&cmd); + prod = riscv_iommu_queue_send(&iommu->cmdq, &cmd, sizeof(cmd)); + + if (!timeout_us) + return; + + if (riscv_iommu_queue_wait(&iommu->cmdq, prod, timeout_us)) + dev_err_once(iommu->dev, + "Hardware error: command execution timeout\n"); +} + +/* + * IOMMU Fault/Event queue chapter 3.2 + */ + +static void riscv_iommu_fault(struct riscv_iommu_device *iommu, + struct riscv_iommu_fq_record *event) +{ + unsigned int err = FIELD_GET(RISCV_IOMMU_FQ_HDR_CAUSE, event->hdr); + unsigned int devid = FIELD_GET(RISCV_IOMMU_FQ_HDR_DID, event->hdr); + + /* Placeholder for future fault handling implementation, report only. */ + if (err) + dev_warn_ratelimited(iommu->dev, + "Fault %d devid: 0x%x iotval: %llx iotval2: %llx\n", + err, devid, event->iotval, event->iotval2); +} + +/* Fault queue interrupt handler thread function */ +static irqreturn_t riscv_iommu_fltq_process(int irq, void *data) +{ + struct riscv_iommu_queue *queue = (struct riscv_iommu_queue *)data; + struct riscv_iommu_device *iommu = queue->iommu; + struct riscv_iommu_fq_record *events; + unsigned int ctrl, idx; + int cnt, len; + + events = (struct riscv_iommu_fq_record *)queue->base; + + /* Clear fault interrupt pending and process all received fault events. */ + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_IPSR, Q_IPSR(queue)); + + do { + cnt = riscv_iommu_queue_consume(queue, &idx); + for (len = 0; len < cnt; idx++, len++) + riscv_iommu_fault(iommu, &events[Q_ITEM(queue, idx)]); + riscv_iommu_queue_release(queue, cnt); + } while (cnt > 0); + + /* Clear MF/OF errors, complete error recovery to be implemented. */ + ctrl = riscv_iommu_readl(iommu, queue->qcr); + if (ctrl & (RISCV_IOMMU_FQCSR_FQMF | RISCV_IOMMU_FQCSR_FQOF)) { + riscv_iommu_writel(iommu, queue->qcr, ctrl); + dev_warn(iommu->dev, + "Queue #%u error; memory fault:%d overflow:%d\n", + queue->qid, + !!(ctrl & RISCV_IOMMU_FQCSR_FQMF), + !!(ctrl & RISCV_IOMMU_FQCSR_FQOF)); + } + + return IRQ_HANDLED; +} + +/* Lookup and initialize device context info structure. */ +static struct riscv_iommu_dc *riscv_iommu_get_dc(struct riscv_iommu_device *iommu, + unsigned int devid) +{ + const bool base_format = !(iommu->caps & RISCV_IOMMU_CAPABILITIES_MSI_FLAT); + unsigned int depth; + unsigned long ddt, old, new; + void *ptr; + u8 ddi_bits[3] = { 0 }; + u64 *ddtp = NULL; + + /* Make sure the mode is valid */ + if (iommu->ddt_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL || + iommu->ddt_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_3LVL) + return NULL; + + /* + * Device id partitioning for base format: + * DDI[0]: bits 0 - 6 (1st level) (7 bits) + * DDI[1]: bits 7 - 15 (2nd level) (9 bits) + * DDI[2]: bits 16 - 23 (3rd level) (8 bits) + * + * For extended format: + * DDI[0]: bits 0 - 5 (1st level) (6 bits) + * DDI[1]: bits 6 - 14 (2nd level) (9 bits) + * DDI[2]: bits 15 - 23 (3rd level) (9 bits) + */ + if (base_format) { + ddi_bits[0] = 7; + ddi_bits[1] = 7 + 9; + ddi_bits[2] = 7 + 9 + 8; + } else { + ddi_bits[0] = 6; + ddi_bits[1] = 6 + 9; + ddi_bits[2] = 6 + 9 + 9; + } + + /* Make sure device id is within range */ + depth = iommu->ddt_mode - RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL; + if (devid >= (1 << ddi_bits[depth])) + return NULL; + + /* Get to the level of the non-leaf node that holds the device context */ + for (ddtp = iommu->ddt_root; depth-- > 0;) { + const int split = ddi_bits[depth]; + /* + * Each non-leaf node is 64bits wide and on each level + * nodes are indexed by DDI[depth]. + */ + ddtp += (devid >> split) & 0x1FF; + + /* + * Check if this node has been populated and if not + * allocate a new level and populate it. + */ + do { + ddt = READ_ONCE(*(unsigned long *)ddtp); + if (ddt & RISCV_IOMMU_DDTE_V) { + ddtp = __va(ppn_to_phys(ddt)); + break; + } + + ptr = riscv_iommu_get_pages(iommu, 0); + if (!ptr) + return NULL; + + new = phys_to_ppn(__pa(ptr)) | RISCV_IOMMU_DDTE_V; + old = cmpxchg_relaxed((unsigned long *)ddtp, ddt, new); + + if (old == ddt) { + ddtp = (u64 *)ptr; + break; + } + + /* Race setting DDT detected, re-read and retry. */ + riscv_iommu_free_pages(iommu, ptr); + } while (1); + } + + /* + * Grab the node that matches DDI[depth], note that when using base + * format the device context is 4 * 64bits, and the extended format + * is 8 * 64bits, hence the (3 - base_format) below. + */ + ddtp += (devid & ((64 << base_format) - 1)) << (3 - base_format); + + return (struct riscv_iommu_dc *)ddtp; +} + +/* + * This is best effort IOMMU translation shutdown flow. + * Disable IOMMU without waiting for hardware response. + */ +static void riscv_iommu_disable(struct riscv_iommu_device *iommu) +{ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_CQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FQCSR, 0); + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_PQCSR, 0); +} + +#define riscv_iommu_read_ddtp(iommu) ({ \ + u64 ddtp; \ + riscv_iommu_readq_timeout((iommu), RISCV_IOMMU_REG_DDTP, ddtp, \ + !(ddtp & RISCV_IOMMU_DDTP_BUSY), 10, \ + RISCV_IOMMU_DDTP_TIMEOUT); \ + ddtp; }) + +static int riscv_iommu_iodir_alloc(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + unsigned int mode; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* + * It is optional for the hardware to report a fixed address for device + * directory root page when DDT.MODE is OFF or BARE. + */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode == RISCV_IOMMU_DDTP_IOMMU_MODE_BARE || + mode == RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) { + /* Use WARL to discover hardware fixed DDT PPN */ + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, + FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, mode)); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + iommu->ddt_phys = ppn_to_phys(ddtp); + if (iommu->ddt_phys) + iommu->ddt_root = devm_ioremap(iommu->dev, + iommu->ddt_phys, PAGE_SIZE); + if (iommu->ddt_root) + memset(iommu->ddt_root, 0, PAGE_SIZE); + } + + if (!iommu->ddt_root) { + iommu->ddt_root = riscv_iommu_get_pages(iommu, 0); + iommu->ddt_phys = __pa(iommu->ddt_root); + } + + if (!iommu->ddt_root) + return -ENOMEM; + + return 0; +} + +/* + * Discover supported DDT modes starting from requested value, + * configure DDTP register with accepted mode and root DDT address. + * Accepted iommu->ddt_mode is updated on success. + */ +static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu, + unsigned int ddtp_mode) +{ + struct device *dev = iommu->dev; + u64 ddtp, rq_ddtp; + unsigned int mode, rq_mode = ddtp_mode; + struct riscv_iommu_command cmd; + + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + /* Disallow state transition from xLVL to xLVL. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + if (mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_BARE && + rq_mode != RISCV_IOMMU_DDTP_IOMMU_MODE_OFF) + return -EINVAL; + + do { + rq_ddtp = FIELD_PREP(RISCV_IOMMU_DDTP_IOMMU_MODE, rq_mode); + if (rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + rq_ddtp |= phys_to_ppn(iommu->ddt_phys); + + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_DDTP, rq_ddtp); + ddtp = riscv_iommu_read_ddtp(iommu); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) { + dev_err(dev, "timeout when setting ddtp (ddt mode: %u, read: %llx)\n", + rq_mode, ddtp); + return -EBUSY; + } + + /* Verify IOMMU hardware accepts new DDTP config. */ + mode = FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp); + + if (rq_mode == mode) + break; + + /* Hardware mandatory DDTP mode has not been accepted. */ + if (rq_mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && rq_ddtp != ddtp) { + dev_err(dev, "DDTP update failed hw: %llx vs %llx\n", + ddtp, rq_ddtp); + return -EINVAL; + } + + /* + * Mode field is WARL, an IOMMU may support a subset of + * directory table levels in which case if we tried to set + * an unsupported number of levels we'll readback either + * a valid xLVL or off/bare. If we got off/bare, try again + * with a smaller xLVL. + */ + if (mode < RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL && + rq_mode > RISCV_IOMMU_DDTP_IOMMU_MODE_1LVL) { + dev_dbg(dev, "DDTP hw mode %u vs %u\n", mode, rq_mode); + rq_mode--; + continue; + } + + /* + * We tried all supported modes and IOMMU hardware failed to + * accept new settings, something went very wrong since off/bare + * and at least one xLVL must be supported. + */ + dev_err(dev, "DDTP hw mode %u, failed to set %u\n", + mode, ddtp_mode); + return -EINVAL; + } while (1); + + iommu->ddt_mode = mode; + if (mode != ddtp_mode) + dev_dbg(dev, "DDTP hw mode %u, requested %u\n", mode, ddtp_mode); + + /* Invalidate device context cache */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* Invalidate address translation cache */ + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_send(iommu, &cmd); + + /* IOFENCE.C */ + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + return 0; +} + +/* This struct contains protection domain specific IOMMU driver data. */ +struct riscv_iommu_domain { + struct iommu_domain domain; + struct list_head bonds; + spinlock_t lock; /* protect bonds list updates. */ + int pscid; + bool amo_enabled; + int numa_node; + unsigned int pgd_mode; + unsigned long *pgd_root; +}; + +#define iommu_domain_to_riscv(iommu_domain) \ + container_of(iommu_domain, struct riscv_iommu_domain, domain) + +/* Private IOMMU data for managed devices, dev_iommu_priv_* */ +struct riscv_iommu_info { + struct riscv_iommu_domain *domain; +}; + +/* + * Linkage between an iommu_domain and attached devices. + * + * Protection domain requiring IOATC and DevATC translation cache invalidations, + * should be linked to attached devices using a riscv_iommu_bond structure. + * Devices should be linked to the domain before first use and unlinked after + * the translations from the referenced protection domain can no longer be used. + * Blocking and identity domains are not tracked here, as the IOMMU hardware + * does not cache negative and/or identity (BARE mode) translations, and DevATC + * is disabled for those protection domains. + * + * The device pointer and IOMMU data remain stable in the bond struct after + * _probe_device() where it's attached to the managed IOMMU, up to the + * completion of the _release_device() call. The release of the bond structure + * is synchronized with the device release. + */ +struct riscv_iommu_bond { + struct list_head list; + struct rcu_head rcu; + struct device *dev; +}; + +static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond; + struct list_head *bonds; + + bond = kzalloc(sizeof(*bond), GFP_KERNEL); + if (!bond) + return -ENOMEM; + bond->dev = dev; + + /* + * List of devices attached to the domain is arranged based on + * managed IOMMU device. + */ + + spin_lock(&domain->lock); + list_for_each(bonds, &domain->bonds) + if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu) + break; + list_add_rcu(&bond->list, bonds); + spin_unlock(&domain->lock); + + /* Synchronize with riscv_iommu_iotlb_inval() sequence. See comment below. */ + smp_mb(); + + return 0; +} + +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_bond *bond, *found = NULL; + struct riscv_iommu_command cmd; + int count = 0; + + if (!domain) + return; + + spin_lock(&domain->lock); + list_for_each_entry(bond, &domain->bonds, list) { + if (found && count) + break; + else if (bond->dev == dev) + found = bond; + else if (dev_to_iommu(bond->dev) == iommu) + count++; + } + if (found) + list_del_rcu(&found->list); + spin_unlock(&domain->lock); + kfree_rcu(found, rcu); + + /* + * If this was the last bond between this domain and the IOMMU + * invalidate all cached entries for domain's PSCID. + */ + if (!count) { + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + riscv_iommu_cmd_send(iommu, &cmd); + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + } +} + +/* + * Send IOTLB.INVAL for whole address space for ranges larger than 2MB. + * This limit will be replaced with range invalidations, if supported by + * the hardware, when RISC-V IOMMU architecture specification update for + * range invalidations update will be available. + */ +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT (2 << 20) + +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain, + unsigned long start, unsigned long end) +{ + struct riscv_iommu_bond *bond; + struct riscv_iommu_device *iommu, *prev; + struct riscv_iommu_command cmd; + unsigned long len = end - start + 1; + unsigned long iova; + + /* + * For each IOMMU linked with this protection domain (via bonds->dev), + * an IOTLB invaliation command will be submitted and executed. + * + * Possbile race with domain attach flow is handled by sequencing + * bond creation - riscv_iommu_bond_link(), and device directory + * update - riscv_iommu_iodir_update(). + * + * PTE Update / IOTLB Inval Device attach & directory update + * -------------------------- -------------------------- + * update page table entries add dev to the bond list + * FENCE RW,RW FENCE RW,RW + * For all IOMMUs: (can be empty) Update FSC/PSCID + * FENCE IOW,IOW FENCE IOW,IOW + * IOTLB.INVAL IODIR.INVAL + * IOFENCE.C + * + * If bond list is not updated with new device, directory context will + * be configured with already valid page table content. If an IOMMU is + * linked to the protection domain it will receive invalidation + * requests for updated page table entries. + */ + smp_mb(); + + rcu_read_lock(); + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + + /* + * IOTLB invalidation request can be safely omitted if already sent + * to the IOMMU for the same PSCID, and with domain->bonds list + * arranged based on the device's IOMMU, it's sufficient to check + * last device the invalidation was sent to. + */ + if (iommu == prev) + continue; + + riscv_iommu_cmd_inval_vma(&cmd); + riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); + if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { + for (iova = start; iova < end; iova += PAGE_SIZE) { + riscv_iommu_cmd_inval_set_addr(&cmd, iova); + riscv_iommu_cmd_send(iommu, &cmd); + } + } else { + riscv_iommu_cmd_send(iommu, &cmd); + } + prev = iommu; + } + + prev = NULL; + list_for_each_entry_rcu(bond, &domain->bonds, list) { + iommu = dev_to_iommu(bond->dev); + if (iommu == prev) + continue; + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + prev = iommu; + } + rcu_read_unlock(); +} + +#define RISCV_IOMMU_FSC_BARE 0 + +/* + * Update IODIR for the device. + * + * During the execution of riscv_iommu_probe_device(), IODIR entries are + * allocated for the device's identifiers. Device context invalidation + * becomes necessary only if one of the updated entries was previously + * marked as valid, given that invalid device context entries are not + * cached by the IOMMU hardware. + * In this implementation, updating a valid device context while the + * device is not quiesced might be disruptive, potentially causing + * interim translation faults. + */ +static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu, + struct device *dev, u64 fsc, u64 ta) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_dc *dc; + struct riscv_iommu_command cmd; + bool sync_required = false; + u64 tc; + int i; + + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + if (!(tc & RISCV_IOMMU_DC_TC_V)) + continue; + + WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V); + + /* Invalidate device context cached values */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + sync_required = true; + } + + if (sync_required) + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); + + /* + * For device context with DC_TC_PDTV = 0, translation attributes valid bit + * is stored as DC_TC_V bit (both sharing the same location at BIT(0)). + */ + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + tc = READ_ONCE(dc->tc); + tc |= ta & RISCV_IOMMU_DC_TC_V; + + WRITE_ONCE(dc->fsc, fsc); + WRITE_ONCE(dc->ta, ta & RISCV_IOMMU_PC_TA_PSCID); + /* Update device context, write TC.V as the last step. */ + dma_wmb(); + WRITE_ONCE(dc->tc, tc); + + /* Invalidate device context after update */ + riscv_iommu_cmd_iodir_inval_ddt(&cmd); + riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); + riscv_iommu_cmd_send(iommu, &cmd); + } + + riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); +} + +/* + * IOVA page translation tree management. + */ + +static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); +} + +static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + + riscv_iommu_iotlb_inval(domain, gather->start, gather->end); +} + +#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) + +#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) +#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) +#define _io_pte_none(pte) ((pte) == 0) +#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) + +static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, + unsigned long pte, struct list_head *freelist) +{ + unsigned long *ptr; + int i; + + if (!_io_pte_present(pte) || _io_pte_leaf(pte)) + return; + + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + + /* Recursively free all sub page table pages */ + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = READ_ONCE(ptr[i]); + if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) + riscv_iommu_pte_free(domain, pte, freelist); + } + + if (freelist) + list_add_tail(&virt_to_page(ptr)->lru, freelist); + else + iommu_free_page(ptr); +} + +static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, + unsigned long iova, size_t pgsize, + gfp_t gfp) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte, old; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + void *addr; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + /* + * Note: returned entry might be a non-leaf if there was + * existing mapping with smaller granularity. Up to the caller + * to replace and invalidate. + */ + if (((size_t)1 << shift) == pgsize) + return ptr; +pte_retry: + pte = READ_ONCE(*ptr); + /* + * This is very likely incorrect as we should not be adding + * new mapping with smaller granularity on top + * of existing 2M/1G mapping. Fail. + */ + if (_io_pte_present(pte) && _io_pte_leaf(pte)) + return NULL; + /* + * Non-leaf entry is missing, allocate and try to add to the + * page table. This might race with other mappings, retry. + */ + if (_io_pte_none(pte)) { + addr = iommu_alloc_page_node(domain->numa_node, gfp); + if (!addr) + return NULL; + old = pte; + pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); + if (cmpxchg_relaxed(ptr, old, pte) != old) { + iommu_free_page(addr); + goto pte_retry; + } + } + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, + unsigned long iova, size_t *pte_pgsize) +{ + unsigned long *ptr = domain->pgd_root; + unsigned long pte; + int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; + + do { + const int shift = PAGE_SHIFT + PT_SHIFT * level; + + ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); + pte = READ_ONCE(*ptr); + if (_io_pte_present(pte) && _io_pte_leaf(pte)) { + *pte_pgsize = (size_t)1 << shift; + return ptr; + } + if (_io_pte_none(pte)) + return NULL; + ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); + } while (level-- > 0); + + return NULL; +} + +static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, + unsigned long iova, phys_addr_t phys, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = 0; + unsigned long *ptr; + unsigned long pte, old, pte_prot; + int rc = 0; + LIST_HEAD(freelist); + + if (!(prot & IOMMU_WRITE)) + pte_prot = _PAGE_BASE | _PAGE_READ; + else if (domain->amo_enabled) + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; + else + pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; + + while (pgcount) { + ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); + if (!ptr) { + rc = -ENOMEM; + break; + } + + old = READ_ONCE(*ptr); + pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); + if (cmpxchg_relaxed(ptr, old, pte) != old) + continue; + + riscv_iommu_pte_free(domain, old, &freelist); + + size += pgsize; + iova += pgsize; + phys += pgsize; + --pgcount; + } + + *mapped = size; + + if (!list_empty(&freelist)) { + /* + * In 1.0 spec version, the smallest scope we can use to + * invalidate all levels of page table (i.e. leaf and non-leaf) + * is an invalidate-all-PSCID IOTINVAL.VMA with AV=0. + * This will be updated with hardware support for + * capability.NL (non-leaf) IOTINVAL command. + */ + riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); + iommu_put_pages_list(&freelist); + } + + return rc; +} + +static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *gather) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + size_t size = pgcount << __ffs(pgsize); + unsigned long *ptr, old; + size_t unmapped = 0; + size_t pte_size; + + while (unmapped < size) { + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (!ptr) + return unmapped; + + /* partial unmap is not allowed, fail. */ + if (iova & (pte_size - 1)) + return unmapped; + + old = READ_ONCE(*ptr); + if (cmpxchg_relaxed(ptr, old, 0) != old) + continue; + + iommu_iotlb_gather_add_page(&domain->domain, gather, iova, + pte_size); + + iova += pte_size; + unmapped += pte_size; + } + + return unmapped; +} + +static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, + dma_addr_t iova) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + unsigned long pte_size; + unsigned long *ptr; + + ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); + if (_io_pte_none(*ptr) || !_io_pte_present(*ptr)) + return 0; + + return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); +} + +static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + const unsigned long pfn = virt_to_pfn(domain->pgd_root); + + WARN_ON(!list_empty(&domain->bonds)); + + if ((int)domain->pscid > 0) + ida_free(&riscv_iommu_pscids, domain->pscid); + + riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); + kfree(domain); +} + +static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode) +{ + switch (pgd_mode) { + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48; + + case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57: + return iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57; + } + return false; +} + +static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + u64 fsc, ta; + + if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) + return -ENODEV; + + fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); + ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | + RISCV_IOMMU_PC_TA_V; + + if (riscv_iommu_bond_link(domain, dev)) + return -ENOMEM; + + riscv_iommu_iodir_update(iommu, dev, fsc, ta); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = domain; + + return 0; +} + +static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { + .attach_dev = riscv_iommu_attach_paging_domain, + .free = riscv_iommu_free_paging_domain, + .map_pages = riscv_iommu_map_pages, + .unmap_pages = riscv_iommu_unmap_pages, + .iova_to_phys = riscv_iommu_iova_to_phys, + .iotlb_sync = riscv_iommu_iotlb_sync, + .flush_iotlb_all = riscv_iommu_iotlb_flush_all, +}; + +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_device *iommu; + unsigned int pgd_mode; + dma_addr_t va_mask; + int va_bits; + + iommu = dev_to_iommu(dev); + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; + va_bits = 57; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; + va_bits = 48; + } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { + pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; + va_bits = 39; + } else { + dev_err(dev, "cannot find supported page table mode\n"); + return ERR_PTR(-ENODEV); + } + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD_RCU(&domain->bonds); + spin_lock_init(&domain->lock); + domain->numa_node = dev_to_node(iommu->dev); + domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); + domain->pgd_mode = pgd_mode; + domain->pgd_root = iommu_alloc_page_node(domain->numa_node, + GFP_KERNEL_ACCOUNT); + if (!domain->pgd_root) { + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, + RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); + if (domain->pscid < 0) { + iommu_free_page(domain->pgd_root); + kfree(domain); + return ERR_PTR(-ENOMEM); + } + + /* + * Note: RISC-V Privilege spec mandates that virtual addresses + * need to be sign-extended, so if (VA_BITS - 1) is set, all + * bits >= VA_BITS need to also be set or else we'll get a + * page fault. However the code that creates the mappings + * above us (e.g. iommu_dma_alloc_iova()) won't do that for us + * for now, so we'll end up with invalid virtual addresses + * to map. As a workaround until we get this sorted out + * limit the available virtual addresses to VA_BITS - 1. + */ + va_mask = DMA_BIT_MASK(va_bits - 1); + + domain->domain.geometry.aperture_start = 0; + domain->domain.geometry.aperture_end = va_mask; + domain->domain.geometry.force_aperture = true; + domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); + + domain->domain.ops = &riscv_iommu_paging_domain_ops; + + return &domain->domain; +} + +static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + /* Make device context invalid, translation requests will fault w/ #258 */ + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_blocking_domain, + } +}; + +static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, + struct device *dev) +{ + struct riscv_iommu_device *iommu = dev_to_iommu(dev); + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V); + riscv_iommu_bond_unlink(info->domain, dev); + info->domain = NULL; + + return 0; +} + +static struct iommu_domain riscv_iommu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = riscv_iommu_attach_identity_domain, + } +}; + +static struct iommu_group *riscv_iommu_device_group(struct device *dev) +{ + if (dev_is_pci(dev)) + return pci_device_group(dev); + return generic_device_group(dev); +} + +static int riscv_iommu_of_xlate(struct device *dev, const struct of_phandle_args *args) +{ + return iommu_fwspec_add_ids(dev, args->args, 1); +} + +static struct iommu_device *riscv_iommu_probe_device(struct device *dev) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct riscv_iommu_device *iommu; + struct riscv_iommu_info *info; + struct riscv_iommu_dc *dc; + u64 tc; + int i; + + if (!fwspec || !fwspec->iommu_fwnode->dev || !fwspec->num_ids) + return ERR_PTR(-ENODEV); + + iommu = dev_get_drvdata(fwspec->iommu_fwnode->dev); + if (!iommu) + return ERR_PTR(-ENODEV); + + /* + * IOMMU hardware operating in fail-over BARE mode will provide + * identity translation for all connected devices anyway... + */ + if (iommu->ddt_mode <= RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) + return ERR_PTR(-ENODEV); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + /* + * Allocate and pre-configure device context entries in + * the device directory. Do not mark the context valid yet. + */ + tc = 0; + if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) + tc |= RISCV_IOMMU_DC_TC_SADE; + for (i = 0; i < fwspec->num_ids; i++) { + dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); + if (!dc) { + kfree(info); + return ERR_PTR(-ENODEV); + } + if (READ_ONCE(dc->tc) & RISCV_IOMMU_DC_TC_V) + dev_warn(dev, "already attached to IOMMU device directory\n"); + WRITE_ONCE(dc->tc, tc); + } + + dev_iommu_priv_set(dev, info); + + return &iommu->iommu; +} + +static void riscv_iommu_release_device(struct device *dev) +{ + struct riscv_iommu_info *info = dev_iommu_priv_get(dev); + + kfree_rcu_mightsleep(info); +} + +static const struct iommu_ops riscv_iommu_ops = { + .pgsize_bitmap = SZ_4K, + .of_xlate = riscv_iommu_of_xlate, + .identity_domain = &riscv_iommu_identity_domain, + .blocked_domain = &riscv_iommu_blocking_domain, + .release_domain = &riscv_iommu_blocking_domain, + .domain_alloc_paging = riscv_iommu_alloc_paging_domain, + .device_group = riscv_iommu_device_group, + .probe_device = riscv_iommu_probe_device, + .release_device = riscv_iommu_release_device, +}; + +static int riscv_iommu_init_check(struct riscv_iommu_device *iommu) +{ + u64 ddtp; + + /* + * Make sure the IOMMU is switched off or in pass-through mode during + * regular boot flow and disable translation when we boot into a kexec + * kernel and the previous kernel left them enabled. + */ + ddtp = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_DDTP); + if (ddtp & RISCV_IOMMU_DDTP_BUSY) + return -EBUSY; + + if (FIELD_GET(RISCV_IOMMU_DDTP_IOMMU_MODE, ddtp) > + RISCV_IOMMU_DDTP_IOMMU_MODE_BARE) { + if (!is_kdump_kernel()) + return -EBUSY; + riscv_iommu_disable(iommu); + } + + /* Configure accesses to in-memory data structures for CPU-native byte order. */ + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) { + if (!(iommu->caps & RISCV_IOMMU_CAPABILITIES_END)) + return -EINVAL; + riscv_iommu_writel(iommu, RISCV_IOMMU_REG_FCTL, + iommu->fctl ^ RISCV_IOMMU_FCTL_BE); + iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); + if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) != + !!(iommu->fctl & RISCV_IOMMU_FCTL_BE)) + return -EINVAL; + } + + /* + * Distribute interrupt vectors, always use first vector for CIV. + * At least one interrupt is required. Read back and verify. + */ + if (!iommu->irqs_count) + return -EINVAL; + + iommu->icvec = FIELD_PREP(RISCV_IOMMU_ICVEC_FIV, 1 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PIV, 2 % iommu->irqs_count) | + FIELD_PREP(RISCV_IOMMU_ICVEC_PMIV, 3 % iommu->irqs_count); + riscv_iommu_writeq(iommu, RISCV_IOMMU_REG_ICVEC, iommu->icvec); + iommu->icvec = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_ICVEC); + if (max(max(FIELD_GET(RISCV_IOMMU_ICVEC_CIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_FIV, iommu->icvec)), + max(FIELD_GET(RISCV_IOMMU_ICVEC_PIV, iommu->icvec), + FIELD_GET(RISCV_IOMMU_ICVEC_PMIV, iommu->icvec))) >= iommu->irqs_count) + return -EINVAL; + + return 0; +} + +void riscv_iommu_remove(struct riscv_iommu_device *iommu) +{ + iommu_device_unregister(&iommu->iommu); + iommu_device_sysfs_remove(&iommu->iommu); + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); + riscv_iommu_queue_disable(&iommu->cmdq); + riscv_iommu_queue_disable(&iommu->fltq); +} + +int riscv_iommu_init(struct riscv_iommu_device *iommu) +{ + int rc; + + RISCV_IOMMU_QUEUE_INIT(&iommu->cmdq, CQ); + RISCV_IOMMU_QUEUE_INIT(&iommu->fltq, FQ); + + rc = riscv_iommu_init_check(iommu); + if (rc) + return dev_err_probe(iommu->dev, rc, "unexpected device state\n"); + + rc = riscv_iommu_iodir_alloc(iommu); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->cmdq, + sizeof(struct riscv_iommu_command)); + if (rc) + return rc; + + rc = riscv_iommu_queue_alloc(iommu, &iommu->fltq, + sizeof(struct riscv_iommu_fq_record)); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->cmdq, riscv_iommu_cmdq_process); + if (rc) + return rc; + + rc = riscv_iommu_queue_enable(iommu, &iommu->fltq, riscv_iommu_fltq_process); + if (rc) + goto err_queue_disable; + + rc = riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_MAX); + if (rc) + goto err_queue_disable; + + rc = iommu_device_sysfs_add(&iommu->iommu, NULL, NULL, "riscv-iommu@%s", + dev_name(iommu->dev)); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register sysfs interface\n"); + goto err_iodir_off; + } + + rc = iommu_device_register(&iommu->iommu, &riscv_iommu_ops, iommu->dev); + if (rc) { + dev_err_probe(iommu->dev, rc, "cannot register iommu interface\n"); + goto err_remove_sysfs; + } + + return 0; + +err_remove_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); +err_iodir_off: + riscv_iommu_iodir_set_mode(iommu, RISCV_IOMMU_DDTP_IOMMU_MODE_OFF); +err_queue_disable: + riscv_iommu_queue_disable(&iommu->fltq); + riscv_iommu_queue_disable(&iommu->cmdq); + return rc; +} diff --git a/drivers/iommu/riscv/iommu.h b/drivers/iommu/riscv/iommu.h new file mode 100644 index 000000000000..b1c4664542b4 --- /dev/null +++ b/drivers/iommu/riscv/iommu.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright © 2022-2024 Rivos Inc. + * Copyright © 2023 FORTH-ICS/CARV + * + * Authors + * Tomasz Jeznach <[email protected]> + * Nick Kossifidis <[email protected]> + */ + +#ifndef _RISCV_IOMMU_H_ +#define _RISCV_IOMMU_H_ + +#include <linux/iommu.h> +#include <linux/types.h> +#include <linux/iopoll.h> + +#include "iommu-bits.h" + +struct riscv_iommu_device; + +struct riscv_iommu_queue { + atomic_t prod; /* unbounded producer allocation index */ + atomic_t head; /* unbounded shadow ring buffer consumer index */ + atomic_t tail; /* unbounded shadow ring buffer producer index */ + unsigned int mask; /* index mask, queue length - 1 */ + unsigned int irq; /* allocated interrupt number */ + struct riscv_iommu_device *iommu; /* iommu device handling the queue when active */ + void *base; /* ring buffer kernel pointer */ + dma_addr_t phys; /* ring buffer physical address */ + u16 qbr; /* base register offset, head and tail reference */ + u16 qcr; /* control and status register offset */ + u8 qid; /* queue identifier, same as RISCV_IOMMU_INTR_XX */ +}; + +struct riscv_iommu_device { + /* iommu core interface */ + struct iommu_device iommu; + + /* iommu hardware */ + struct device *dev; + + /* hardware control register space */ + void __iomem *reg; + + /* supported and enabled hardware capabilities */ + u64 caps; + u32 fctl; + + /* available interrupt numbers, MSI or WSI */ + unsigned int irqs[RISCV_IOMMU_INTR_COUNT]; + unsigned int irqs_count; + unsigned int icvec; + + /* hardware queues */ + struct riscv_iommu_queue cmdq; + struct riscv_iommu_queue fltq; + + /* device directory */ + unsigned int ddt_mode; + dma_addr_t ddt_phys; + u64 *ddt_root; +}; + +int riscv_iommu_init(struct riscv_iommu_device *iommu); +void riscv_iommu_remove(struct riscv_iommu_device *iommu); + +#define riscv_iommu_readl(iommu, addr) \ + readl_relaxed((iommu)->reg + (addr)) + +#define riscv_iommu_readq(iommu, addr) \ + readq_relaxed((iommu)->reg + (addr)) + +#define riscv_iommu_writel(iommu, addr, val) \ + writel_relaxed((val), (iommu)->reg + (addr)) + +#define riscv_iommu_writeq(iommu, addr, val) \ + writeq_relaxed((val), (iommu)->reg + (addr)) + +#define riscv_iommu_readq_timeout(iommu, addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq_relaxed, (iommu)->reg + (addr), val, cond, \ + delay_us, timeout_us) + +#define riscv_iommu_readl_timeout(iommu, addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl_relaxed, (iommu)->reg + (addr), val, cond, \ + delay_us, timeout_us) + +#endif diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c index d8eaa7ea380b..fbdeded3d48b 100644 --- a/drivers/iommu/s390-iommu.c +++ b/drivers/iommu/s390-iommu.c @@ -33,6 +33,8 @@ struct s390_domain { struct rcu_head rcu; }; +static struct iommu_domain blocking_domain; + static inline unsigned int calc_rtx(dma_addr_t ptr) { return ((unsigned long)ptr >> ZPCI_RT_SHIFT) & ZPCI_INDEX_MASK; @@ -369,20 +371,36 @@ static void s390_domain_free(struct iommu_domain *domain) call_rcu(&s390_domain->rcu, s390_iommu_rcu_free_domain); } -static void s390_iommu_detach_device(struct iommu_domain *domain, - struct device *dev) +static void zdev_s390_domain_update(struct zpci_dev *zdev, + struct iommu_domain *domain) +{ + unsigned long flags; + + spin_lock_irqsave(&zdev->dom_lock, flags); + zdev->s390_domain = domain; + spin_unlock_irqrestore(&zdev->dom_lock, flags); +} + +static int blocking_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { - struct s390_domain *s390_domain = to_s390_domain(domain); struct zpci_dev *zdev = to_zpci_dev(dev); + struct s390_domain *s390_domain; unsigned long flags; + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) + return 0; + + s390_domain = to_s390_domain(zdev->s390_domain); spin_lock_irqsave(&s390_domain->list_lock, flags); list_del_rcu(&zdev->iommu_list); spin_unlock_irqrestore(&s390_domain->list_lock, flags); zpci_unregister_ioat(zdev, 0); - zdev->s390_domain = NULL; zdev->dma_table = NULL; + zdev_s390_domain_update(zdev, domain); + + return 0; } static int s390_iommu_attach_device(struct iommu_domain *domain, @@ -401,20 +419,15 @@ static int s390_iommu_attach_device(struct iommu_domain *domain, domain->geometry.aperture_end < zdev->start_dma)) return -EINVAL; - if (zdev->s390_domain) - s390_iommu_detach_device(&zdev->s390_domain->domain, dev); + blocking_domain_attach_device(&blocking_domain, dev); + /* If we fail now DMA remains blocked via blocking domain */ cc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, virt_to_phys(s390_domain->dma_table), &status); - /* - * If the device is undergoing error recovery the reset code - * will re-establish the new domain. - */ if (cc && status != ZPCI_PCI_ST_FUNC_NOT_AVAIL) return -EIO; - zdev->dma_table = s390_domain->dma_table; - zdev->s390_domain = s390_domain; + zdev_s390_domain_update(zdev, domain); spin_lock_irqsave(&s390_domain->list_lock, flags); list_add_rcu(&zdev->iommu_list, &s390_domain->devices); @@ -466,19 +479,11 @@ static struct iommu_device *s390_iommu_probe_device(struct device *dev) if (zdev->tlb_refresh) dev->iommu->shadow_on_flush = 1; - return &zdev->iommu_dev; -} + /* Start with DMA blocked */ + spin_lock_init(&zdev->dom_lock); + zdev_s390_domain_update(zdev, &blocking_domain); -static void s390_iommu_release_device(struct device *dev) -{ - struct zpci_dev *zdev = to_zpci_dev(dev); - - /* - * release_device is expected to detach any domain currently attached - * to the device, but keep it attached to other devices in the group. - */ - if (zdev) - s390_iommu_detach_device(&zdev->s390_domain->domain, dev); + return &zdev->iommu_dev; } static int zpci_refresh_all(struct zpci_dev *zdev) @@ -697,9 +702,15 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain, struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev) { - if (!zdev || !zdev->s390_domain) + struct s390_domain *s390_domain; + + lockdep_assert_held(&zdev->dom_lock); + + if (zdev->s390_domain->type == IOMMU_DOMAIN_BLOCKED) return NULL; - return &zdev->s390_domain->ctrs; + + s390_domain = to_s390_domain(zdev->s390_domain); + return &s390_domain->ctrs; } int zpci_init_iommu(struct zpci_dev *zdev) @@ -776,11 +787,19 @@ static int __init s390_iommu_init(void) } subsys_initcall(s390_iommu_init); +static struct iommu_domain blocking_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocking_domain_attach_device, + } +}; + static const struct iommu_ops s390_iommu_ops = { + .blocked_domain = &blocking_domain, + .release_domain = &blocking_domain, .capable = s390_iommu_capable, .domain_alloc_paging = s390_domain_alloc_paging, .probe_device = s390_iommu_probe_device, - .release_device = s390_iommu_release_device, .device_group = generic_device_group, .pgsize_bitmap = SZ_4K, .get_resv_regions = s390_iommu_get_resv_regions, |