diff options
Diffstat (limited to 'drivers/iommu/intel')
-rw-r--r-- | drivers/iommu/intel/Kconfig | 6 | ||||
-rw-r--r-- | drivers/iommu/intel/Makefile | 1 | ||||
-rw-r--r-- | drivers/iommu/intel/debugfs.c | 111 | ||||
-rw-r--r-- | drivers/iommu/intel/dmar.c | 58 | ||||
-rw-r--r-- | drivers/iommu/intel/iommu.c | 215 | ||||
-rw-r--r-- | drivers/iommu/intel/pasid.c | 15 | ||||
-rw-r--r-- | drivers/iommu/intel/pasid.h | 6 | ||||
-rw-r--r-- | drivers/iommu/intel/perf.c | 166 | ||||
-rw-r--r-- | drivers/iommu/intel/perf.h | 73 | ||||
-rw-r--r-- | drivers/iommu/intel/svm.c | 644 |
10 files changed, 875 insertions, 420 deletions
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 28a3d1596c76..43ebd8af11c5 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -3,6 +3,9 @@ config DMAR_TABLE bool +config DMAR_PERF + bool + config INTEL_IOMMU bool "Support for Intel IOMMU using DMA Remapping Devices" depends on PCI_MSI && ACPI && (X86 || IA64) @@ -14,6 +17,7 @@ config INTEL_IOMMU select SWIOTLB select IOASID select IOMMU_DMA + select PCI_ATS help DMA remapping (DMAR) devices support enables independent address translations for Direct Memory Access (DMA) from devices. @@ -24,6 +28,7 @@ config INTEL_IOMMU config INTEL_IOMMU_DEBUGFS bool "Export Intel IOMMU internals in Debugfs" depends on INTEL_IOMMU && IOMMU_DEBUGFS + select DMAR_PERF help !!!WARNING!!! @@ -41,6 +46,7 @@ config INTEL_IOMMU_SVM select PCI_PRI select MMU_NOTIFIER select IOASID + select IOMMU_SVA_LIB help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address space by diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index ae236ec7d219..fa0dae16441c 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o +obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o obj-$(CONFIG_INTEL_IOMMU_SVM) += svm.o obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o diff --git a/drivers/iommu/intel/debugfs.c b/drivers/iommu/intel/debugfs.c index efea7f02abd9..62e23ff3c987 100644 --- a/drivers/iommu/intel/debugfs.c +++ b/drivers/iommu/intel/debugfs.c @@ -16,6 +16,7 @@ #include <asm/irq_remapping.h> #include "pasid.h" +#include "perf.h" struct tbl_walk { u16 bus; @@ -31,6 +32,9 @@ struct iommu_regset { const char *regs; }; +#define DEBUG_BUFFER_SIZE 1024 +static char debug_buf[DEBUG_BUFFER_SIZE]; + #define IOMMU_REGSET_ENTRY(_reg_) \ { DMAR_##_reg_##_REG, __stringify(_reg_) } @@ -538,6 +542,111 @@ static int ir_translation_struct_show(struct seq_file *m, void *unused) DEFINE_SHOW_ATTRIBUTE(ir_translation_struct); #endif +static void latency_show_one(struct seq_file *m, struct intel_iommu *iommu, + struct dmar_drhd_unit *drhd) +{ + int ret; + + seq_printf(m, "IOMMU: %s Register Base Address: %llx\n", + iommu->name, drhd->reg_base_addr); + + ret = dmar_latency_snapshot(iommu, debug_buf, DEBUG_BUFFER_SIZE); + if (ret < 0) + seq_puts(m, "Failed to get latency snapshot"); + else + seq_puts(m, debug_buf); + seq_puts(m, "\n"); +} + +static int latency_show(struct seq_file *m, void *v) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + latency_show_one(m, iommu, drhd); + rcu_read_unlock(); + + return 0; +} + +static int dmar_perf_latency_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, latency_show, NULL); +} + +static ssize_t dmar_perf_latency_write(struct file *filp, + const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int counting; + char buf[64]; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (kstrtoint(buf, 0, &counting)) + return -EINVAL; + + switch (counting) { + case 0: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) { + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IOTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_DEVTLB); + dmar_latency_disable(iommu, DMAR_LATENCY_INV_IEC); + dmar_latency_disable(iommu, DMAR_LATENCY_PRQ); + } + rcu_read_unlock(); + break; + case 1: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IOTLB); + rcu_read_unlock(); + break; + case 2: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_DEVTLB); + rcu_read_unlock(); + break; + case 3: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_INV_IEC); + rcu_read_unlock(); + break; + case 4: + rcu_read_lock(); + for_each_active_iommu(iommu, drhd) + dmar_latency_enable(iommu, DMAR_LATENCY_PRQ); + rcu_read_unlock(); + break; + default: + return -EINVAL; + } + + *ppos += cnt; + return cnt; +} + +static const struct file_operations dmar_perf_latency_fops = { + .open = dmar_perf_latency_open, + .write = dmar_perf_latency_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + void __init intel_iommu_debugfs_init(void) { struct dentry *intel_iommu_debug = debugfs_create_dir("intel", @@ -556,4 +665,6 @@ void __init intel_iommu_debugfs_init(void) debugfs_create_file("ir_translation_struct", 0444, intel_iommu_debug, NULL, &ir_translation_struct_fops); #endif + debugfs_create_file("dmar_perf_latency", 0644, intel_iommu_debug, + NULL, &dmar_perf_latency_fops); } diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 1757ac1e1623..d66f79acd14d 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -34,6 +34,7 @@ #include <trace/events/intel_iommu.h> #include "../irq_remapping.h" +#include "perf.h" typedef int (*dmar_res_handler_t)(struct acpi_dmar_header *, void *); struct dmar_res_callback { @@ -1142,7 +1143,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) err = iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL); if (err) - goto err_unmap; + goto err_sysfs; } drhd->iommu = iommu; @@ -1150,6 +1151,8 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) return 0; +err_sysfs: + iommu_device_sysfs_remove(&iommu->iommu); err_unmap: unmap_iommu(iommu); error_free_seq_id: @@ -1340,15 +1343,33 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options) { struct q_inval *qi = iommu->qi; + s64 devtlb_start_ktime = 0; + s64 iotlb_start_ktime = 0; + s64 iec_start_ktime = 0; struct qi_desc wait_desc; int wait_index, index; unsigned long flags; int offset, shift; int rc, i; + u64 type; if (!qi) return 0; + type = desc->qw0 & GENMASK_ULL(3, 0); + + if ((type == QI_IOTLB_TYPE || type == QI_EIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IOTLB)) + iotlb_start_ktime = ktime_to_ns(ktime_get()); + + if ((type == QI_DIOTLB_TYPE || type == QI_DEIOTLB_TYPE) && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_DEVTLB)) + devtlb_start_ktime = ktime_to_ns(ktime_get()); + + if (type == QI_IEC_TYPE && + dmar_latency_enabled(iommu, DMAR_LATENCY_INV_IEC)) + iec_start_ktime = ktime_to_ns(ktime_get()); + restart: rc = 0; @@ -1423,6 +1444,18 @@ restart: if (rc == -EAGAIN) goto restart; + if (iotlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IOTLB, + ktime_to_ns(ktime_get()) - iotlb_start_ktime); + + if (devtlb_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_DEVTLB, + ktime_to_ns(ktime_get()) - devtlb_start_ktime); + + if (iec_start_ktime) + dmar_latency_update(iommu, DMAR_LATENCY_INV_IEC, + ktime_to_ns(ktime_get()) - iec_start_ktime); + return rc; } @@ -1911,16 +1944,23 @@ static int dmar_fault_do_one(struct intel_iommu *iommu, int type, reason = dmar_get_fault_reason(fault_reason, &fault_type); if (fault_type == INTR_REMAP) - pr_err("[INTR-REMAP] Request device [%02x:%02x.%d] fault index %llx [fault reason %02d] %s\n", - source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), addr >> 48, - fault_reason, reason); - else - pr_err("[%s] Request device [%02x:%02x.%d] PASID %x fault addr %llx [fault reason %02d] %s\n", + pr_err("[INTR-REMAP] Request device [0x%02x:0x%02x.%d] fault index 0x%llx [fault reason 0x%02x] %s\n", + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr >> 48, + fault_reason, reason); + else if (pasid == INVALID_IOASID) + pr_err("[%s NO_PASID] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", type ? "DMA Read" : "DMA Write", source_id >> 8, PCI_SLOT(source_id & 0xFF), - PCI_FUNC(source_id & 0xFF), pasid, addr, + PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason); + else + pr_err("[%s PASID 0x%x] Request device [0x%02x:0x%02x.%d] fault addr 0x%llx [fault reason 0x%02x] %s\n", + type ? "DMA Read" : "DMA Write", pasid, + source_id >> 8, PCI_SLOT(source_id & 0xFF), + PCI_FUNC(source_id & 0xFF), addr, + fault_reason, reason); + return 0; } @@ -1987,7 +2027,7 @@ irqreturn_t dmar_fault(int irq, void *dev_id) if (!ratelimited) /* Using pasid -1 if pasid is not present */ dmar_fault_do_one(iommu, type, fault_reason, - pasid_present ? pasid : -1, + pasid_present ? pasid : INVALID_IOASID, source_id, guest_addr); fault_index++; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 708f430af1c4..dd22fc7d5176 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -46,6 +46,7 @@ #include <asm/iommu.h> #include "../irq_remapping.h" +#include "../iommu-sva-lib.h" #include "pasid.h" #include "cap_audit.h" @@ -564,7 +565,7 @@ static inline int domain_pfn_supported(struct dmar_domain *domain, static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; - int agaw = -1; + int agaw; sagaw = cap_sagaw(iommu->cap); for (agaw = width_to_agaw(max_gaw); @@ -625,12 +626,12 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) bool found = false; int i; - domain->iommu_coherency = 1; + domain->iommu_coherency = true; for_each_domain_iommu(i, domain) { found = true; if (!iommu_paging_structure_coherency(g_iommus[i])) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } @@ -641,18 +642,18 @@ static void domain_update_iommu_coherency(struct dmar_domain *domain) rcu_read_lock(); for_each_active_iommu(iommu, drhd) { if (!iommu_paging_structure_coherency(iommu)) { - domain->iommu_coherency = 0; + domain->iommu_coherency = false; break; } } rcu_read_unlock(); } -static int domain_update_iommu_snooping(struct intel_iommu *skip) +static bool domain_update_iommu_snooping(struct intel_iommu *skip) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; - int ret = 1; + bool ret = true; rcu_read_lock(); for_each_active_iommu(iommu, drhd) { @@ -665,7 +666,7 @@ static int domain_update_iommu_snooping(struct intel_iommu *skip) */ if (!sm_supported(iommu) && !ecap_sc_support(iommu->ecap)) { - ret = 0; + ret = false; break; } } @@ -682,9 +683,8 @@ static int domain_update_iommu_superpage(struct dmar_domain *domain, struct intel_iommu *iommu; int mask = 0x3; - if (!intel_iommu_superpage) { + if (!intel_iommu_superpage) return 0; - } /* set iommu_superpage to the smallest common denominator */ rcu_read_lock(); @@ -1919,7 +1919,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] += 1; - domain->iommu_count += 1; if (domain->iommu_refcnt[iommu->seq_id] == 1) { ndomains = cap_ndoms(iommu->cap); num = find_first_zero_bit(iommu->domain_ids, ndomains); @@ -1927,7 +1926,6 @@ static int domain_attach_iommu(struct dmar_domain *domain, if (num >= ndomains) { pr_err("%s: No free domain ids\n", iommu->name); domain->iommu_refcnt[iommu->seq_id] -= 1; - domain->iommu_count -= 1; return -ENOSPC; } @@ -1943,16 +1941,15 @@ static int domain_attach_iommu(struct dmar_domain *domain, return 0; } -static int domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +static void domain_detach_iommu(struct dmar_domain *domain, + struct intel_iommu *iommu) { - int num, count; + int num; assert_spin_locked(&device_domain_lock); assert_spin_locked(&iommu->lock); domain->iommu_refcnt[iommu->seq_id] -= 1; - count = --domain->iommu_count; if (domain->iommu_refcnt[iommu->seq_id] == 0) { num = domain->iommu_did[iommu->seq_id]; clear_bit(num, iommu->domain_ids); @@ -1961,8 +1958,6 @@ static int domain_detach_iommu(struct dmar_domain *domain, domain_update_iommu_cap(domain); domain->iommu_did[iommu->seq_id] = 0; } - - return count; } static inline int guestwidth_to_adjustwidth(int gaw) @@ -2434,10 +2429,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, return 0; } -static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn) +static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn) { - unsigned long flags; + struct intel_iommu *iommu = info->iommu; struct context_entry *context; + unsigned long flags; u16 did_old; if (!iommu) @@ -2449,7 +2445,16 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn spin_unlock_irqrestore(&iommu->lock, flags); return; } - did_old = context_domain_id(context); + + if (sm_supported(iommu)) { + if (hw_pass_through && domain_type_is_si(info->domain)) + did_old = FLPT_DEFAULT_DID; + else + did_old = info->domain->iommu_did[iommu->seq_id]; + } else { + did_old = context_domain_id(context); + } + context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock_irqrestore(&iommu->lock, flags); @@ -2467,6 +2472,8 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn 0, 0, DMA_TLB_DSI_FLUSH); + + __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH); } static inline void unlink_domain_info(struct device_domain_info *info) @@ -2525,9 +2532,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, u32 pasid) { - int flags = PASID_FLAG_SUPERVISOR_MODE; struct dma_pte *pgd = domain->pgd; int agaw, level; + int flags = 0; /* * Skip top levels of page tables for iommu which has @@ -2543,7 +2550,10 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (level != 4 && level != 5) return -EINVAL; - flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; + if (pasid != PASID_RID2PASID) + flags |= PASID_FLAG_SUPERVISOR_MODE; + if (level == 5) + flags |= PASID_FLAG_FL5LP; if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED) flags |= PASID_FLAG_PAGE_SNOOP; @@ -4135,62 +4145,56 @@ static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev) return container_of(iommu_dev, struct intel_iommu, iommu); } -static ssize_t intel_iommu_show_version(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); u32 ver = readl(iommu->reg + DMAR_VER_REG); return sprintf(buf, "%d:%d\n", DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver)); } -static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL); +static DEVICE_ATTR_RO(version); -static ssize_t intel_iommu_show_address(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t address_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->reg_phys); } -static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL); +static DEVICE_ATTR_RO(address); -static ssize_t intel_iommu_show_cap(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t cap_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->cap); } -static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL); +static DEVICE_ATTR_RO(cap); -static ssize_t intel_iommu_show_ecap(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t ecap_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%llx\n", iommu->ecap); } -static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL); +static DEVICE_ATTR_RO(ecap); -static ssize_t intel_iommu_show_ndoms(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t domains_supported_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap)); } -static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL); +static DEVICE_ATTR_RO(domains_supported); -static ssize_t intel_iommu_show_ndoms_used(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t domains_used_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct intel_iommu *iommu = dev_to_intel_iommu(dev); return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))); } -static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL); +static DEVICE_ATTR_RO(domains_used); static struct attribute *intel_iommu_attrs[] = { &dev_attr_version.attr, @@ -4433,9 +4437,9 @@ out_free_dmar: static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque) { - struct intel_iommu *iommu = opaque; + struct device_domain_info *info = opaque; - domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff); + domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff); return 0; } @@ -4445,12 +4449,13 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op * devices, unbinding the driver from any one of them will possibly leave * the others unable to operate. */ -static void domain_context_clear(struct intel_iommu *iommu, struct device *dev) +static void domain_context_clear(struct device_domain_info *info) { - if (!iommu || !dev || !dev_is_pci(dev)) + if (!info->iommu || !info->dev || !dev_is_pci(info->dev)) return; - pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu); + pci_for_each_dma_alias(to_pci_dev(info->dev), + &domain_context_clear_one_cb, info); } static void __dmar_remove_one_dev_info(struct device_domain_info *info) @@ -4467,14 +4472,13 @@ static void __dmar_remove_one_dev_info(struct device_domain_info *info) iommu = info->iommu; domain = info->domain; - if (info->dev) { + if (info->dev && !dev_is_real_dma_subdevice(info->dev)) { if (dev_is_pci(info->dev) && sm_supported(iommu)) intel_pasid_tear_down_entry(iommu, info->dev, PASID_RID2PASID, false); iommu_disable_dev_iotlb(info); - if (!dev_is_real_dma_subdevice(info->dev)) - domain_context_clear(iommu, info->dev); + domain_context_clear(info); intel_pasid_free_table(info->dev); } @@ -4508,13 +4512,13 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) adjust_width = guestwidth_to_adjustwidth(guest_width); domain->agaw = width_to_agaw(adjust_width); - domain->iommu_coherency = 0; - domain->iommu_snooping = 0; + domain->iommu_coherency = false; + domain->iommu_snooping = false; domain->iommu_superpage = 0; domain->max_addr = 0; /* always allocate the top pgd */ - domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid); + domain->pgd = alloc_pgtable_page(domain->nid); if (!domain->pgd) return -ENOMEM; domain_flush_cache(domain, domain->pgd, PAGE_SIZE); @@ -4606,6 +4610,8 @@ static int auxiliary_link_device(struct dmar_domain *domain, if (!sinfo) { sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC); + if (!sinfo) + return -ENOMEM; sinfo->domain = domain; sinfo->pdev = dev; list_add(&sinfo->link_phys, &info->subdevices); @@ -4752,6 +4758,13 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, if (!iommu) return -ENODEV; + if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) && + !ecap_nest(iommu->ecap)) { + dev_err(dev, "%s: iommu not support nested translation\n", + iommu->name); + return -EINVAL; + } + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -4773,8 +4786,7 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, pte = dmar_domain->pgd; if (dma_pte_present(pte)) { - dmar_domain->pgd = (struct dma_pte *) - phys_to_virt(dma_pte_addr(pte)); + dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); free_pgtable_page(pte); } dmar_domain->agaw--; @@ -5124,7 +5136,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, static bool intel_iommu_capable(enum iommu_cap cap) { if (cap == IOMMU_CAP_CACHE_COHERENCY) - return domain_update_iommu_snooping(NULL) == 1; + return domain_update_iommu_snooping(NULL); if (cap == IOMMU_CAP_INTR_REMAP) return irq_remapping_enabled == 1; @@ -5160,13 +5172,10 @@ static void intel_iommu_release_device(struct device *dev) static void intel_iommu_probe_finalize(struct device *dev) { - dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT; struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); if (domain && domain->type == IOMMU_DOMAIN_DMA) - iommu_setup_dma_ops(dev, base, - __DOMAIN_MAX_ADDR(dmar_domain->gaw) - base); + iommu_setup_dma_ops(dev, 0, U64_MAX); else set_dma_ops(dev, NULL); } @@ -5326,6 +5335,48 @@ static int intel_iommu_disable_auxd(struct device *dev) return 0; } +static int intel_iommu_enable_sva(struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu; + int ret; + + if (!info || dmar_disabled) + return -EINVAL; + + iommu = info->iommu; + if (!iommu) + return -EINVAL; + + if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE)) + return -ENODEV; + + if (intel_iommu_enable_pasid(iommu, dev)) + return -ENODEV; + + if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) + return -EINVAL; + + ret = iopf_queue_add_device(iommu->iopf_queue, dev); + if (!ret) + ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); + + return ret; +} + +static int intel_iommu_disable_sva(struct device *dev) +{ + struct device_domain_info *info = get_domain_info(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + ret = iommu_unregister_device_fault_handler(dev); + if (!ret) + ret = iopf_queue_remove_device(iommu->iopf_queue, dev); + + return ret; +} + /* * A PCI express designated vendor specific extended capability is defined * in the section 3.7 of Intel scalable I/O virtualization technical spec @@ -5387,35 +5438,37 @@ intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat) static int intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) { - if (feat == IOMMU_DEV_FEAT_AUX) + switch (feat) { + case IOMMU_DEV_FEAT_AUX: return intel_iommu_enable_auxd(dev); - if (feat == IOMMU_DEV_FEAT_IOPF) + case IOMMU_DEV_FEAT_IOPF: return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV; - if (feat == IOMMU_DEV_FEAT_SVA) { - struct device_domain_info *info = get_domain_info(dev); - - if (!info) - return -EINVAL; + case IOMMU_DEV_FEAT_SVA: + return intel_iommu_enable_sva(dev); - if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled) - return -EINVAL; - - if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) - return 0; + default: + return -ENODEV; } - - return -ENODEV; } static int intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) { - if (feat == IOMMU_DEV_FEAT_AUX) + switch (feat) { + case IOMMU_DEV_FEAT_AUX: return intel_iommu_disable_auxd(dev); - return -ENODEV; + case IOMMU_DEV_FEAT_IOPF: + return 0; + + case IOMMU_DEV_FEAT_SVA: + return intel_iommu_disable_sva(dev); + + default: + return -ENODEV; + } } static bool @@ -5452,7 +5505,7 @@ intel_iommu_enable_nesting(struct iommu_domain *domain) int ret = -ENODEV; spin_lock_irqsave(&device_domain_lock, flags); - if (nested_mode_support() && list_empty(&dmar_domain->devices)) { + if (list_empty(&dmar_domain->devices)) { dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE; dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL; ret = 0; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 72646bafc52f..9ec374e17469 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/** +/* * intel-pasid.c - PASID idr, table and entry manipulation * * Copyright (C) 2018 Intel Corporation @@ -511,7 +511,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore) { struct pasid_entry *pte; - u16 did; + u16 did, pgtt; pte = intel_pasid_get_entry(dev, pasid); if (WARN_ON(!pte)) @@ -521,13 +521,19 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, return; did = pasid_get_domain_id(pte); + pgtt = pasid_pte_get_pgtt(pte); + intel_pasid_clear_entry(dev, pasid, fault_ignore); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(pte, sizeof(*pte)); pasid_cache_invalidation_with_pasid(iommu, did, pasid); - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + + if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) + qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); + else + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); /* Device IOTLB doesn't need to be flushed in caching mode. */ if (!cap_caching_mode(iommu->cap)) @@ -699,7 +705,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). */ - pasid_set_sre(pte); + if (pasid != PASID_RID2PASID) + pasid_set_sre(pte); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 5ff61c3d401f..c11bc8b833b8 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -99,6 +99,12 @@ static inline bool pasid_pte_is_present(struct pasid_entry *pte) return READ_ONCE(pte->val[0]) & PASID_PTE_PRESENT; } +/* Get PGTT field of a PASID table entry */ +static inline u16 pasid_pte_get_pgtt(struct pasid_entry *pte) +{ + return (u16)((READ_ONCE(pte->val[0]) >> 6) & 0x7); +} + extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); void intel_pasid_free_table(struct device *dev); diff --git a/drivers/iommu/intel/perf.c b/drivers/iommu/intel/perf.c new file mode 100644 index 000000000000..73b7ec705552 --- /dev/null +++ b/drivers/iommu/intel/perf.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/** + * perf.c - performance monitor + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + * Fenghua Yu <fenghua.yu@intel.com> + */ + +#include <linux/spinlock.h> +#include <linux/intel-iommu.h> + +#include "perf.h" + +static DEFINE_SPINLOCK(latency_lock); + +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + + return lstat && lstat[type].enabled; +} + +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat; + unsigned long flags; + int ret = -EBUSY; + + if (dmar_latency_enabled(iommu, type)) + return 0; + + spin_lock_irqsave(&latency_lock, flags); + if (!iommu->perf_statistic) { + iommu->perf_statistic = kzalloc(sizeof(*lstat) * DMAR_LATENCY_NUM, + GFP_ATOMIC); + if (!iommu->perf_statistic) { + ret = -ENOMEM; + goto unlock_out; + } + } + + lstat = iommu->perf_statistic; + + if (!lstat[type].enabled) { + lstat[type].enabled = true; + lstat[type].counter[COUNTS_MIN] = UINT_MAX; + ret = 0; + } +unlock_out: + spin_unlock_irqrestore(&latency_lock, flags); + + return ret; +} + +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&lstat[type], 0, sizeof(*lstat) * DMAR_LATENCY_NUM); + spin_unlock_irqrestore(&latency_lock, flags); +} + +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + u64 min, max; + + if (!dmar_latency_enabled(iommu, type)) + return; + + spin_lock_irqsave(&latency_lock, flags); + if (latency < 100) + lstat[type].counter[COUNTS_10e2]++; + else if (latency < 1000) + lstat[type].counter[COUNTS_10e3]++; + else if (latency < 10000) + lstat[type].counter[COUNTS_10e4]++; + else if (latency < 100000) + lstat[type].counter[COUNTS_10e5]++; + else if (latency < 1000000) + lstat[type].counter[COUNTS_10e6]++; + else if (latency < 10000000) + lstat[type].counter[COUNTS_10e7]++; + else + lstat[type].counter[COUNTS_10e8_plus]++; + + min = lstat[type].counter[COUNTS_MIN]; + max = lstat[type].counter[COUNTS_MAX]; + lstat[type].counter[COUNTS_MIN] = min_t(u64, min, latency); + lstat[type].counter[COUNTS_MAX] = max_t(u64, max, latency); + lstat[type].counter[COUNTS_SUM] += latency; + lstat[type].samples++; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static char *latency_counter_names[] = { + " <0.1us", + " 0.1us-1us", " 1us-10us", " 10us-100us", + " 100us-1ms", " 1ms-10ms", " >=10ms", + " min(us)", " max(us)", " average(us)" +}; + +static char *latency_type_names[] = { + " inv_iotlb", " inv_devtlb", " inv_iec", + " svm_prq" +}; + +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + struct latency_statistic *lstat = iommu->perf_statistic; + unsigned long flags; + int bytes = 0, i, j; + + memset(str, 0, size); + + for (i = 0; i < COUNTS_NUM; i++) + bytes += snprintf(str + bytes, size - bytes, + "%s", latency_counter_names[i]); + + spin_lock_irqsave(&latency_lock, flags); + for (i = 0; i < DMAR_LATENCY_NUM; i++) { + if (!dmar_latency_enabled(iommu, i)) + continue; + + bytes += snprintf(str + bytes, size - bytes, + "\n%s", latency_type_names[i]); + + for (j = 0; j < COUNTS_NUM; j++) { + u64 val = lstat[i].counter[j]; + + switch (j) { + case COUNTS_MIN: + if (val == UINT_MAX) + val = 0; + else + val = div_u64(val, 1000); + break; + case COUNTS_MAX: + val = div_u64(val, 1000); + break; + case COUNTS_SUM: + if (lstat[i].samples) + val = div_u64(val, (lstat[i].samples * 1000)); + else + val = 0; + break; + default: + break; + } + + bytes += snprintf(str + bytes, size - bytes, + "%12lld", val); + } + } + spin_unlock_irqrestore(&latency_lock, flags); + + return bytes; +} diff --git a/drivers/iommu/intel/perf.h b/drivers/iommu/intel/perf.h new file mode 100644 index 000000000000..fd6db8049d1a --- /dev/null +++ b/drivers/iommu/intel/perf.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * perf.h - performance monitor header + * + * Copyright (C) 2021 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + */ + +enum latency_type { + DMAR_LATENCY_INV_IOTLB = 0, + DMAR_LATENCY_INV_DEVTLB, + DMAR_LATENCY_INV_IEC, + DMAR_LATENCY_PRQ, + DMAR_LATENCY_NUM +}; + +enum latency_count { + COUNTS_10e2 = 0, /* < 0.1us */ + COUNTS_10e3, /* 0.1us ~ 1us */ + COUNTS_10e4, /* 1us ~ 10us */ + COUNTS_10e5, /* 10us ~ 100us */ + COUNTS_10e6, /* 100us ~ 1ms */ + COUNTS_10e7, /* 1ms ~ 10ms */ + COUNTS_10e8_plus, /* 10ms and plus*/ + COUNTS_MIN, + COUNTS_MAX, + COUNTS_SUM, + COUNTS_NUM +}; + +struct latency_statistic { + bool enabled; + u64 counter[COUNTS_NUM]; + u64 samples; +}; + +#ifdef CONFIG_DMAR_PERF +int dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type); +bool dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type); +void dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, + u64 latency); +int dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size); +#else +static inline int +dmar_latency_enable(struct intel_iommu *iommu, enum latency_type type) +{ + return -EINVAL; +} + +static inline void +dmar_latency_disable(struct intel_iommu *iommu, enum latency_type type) +{ +} + +static inline bool +dmar_latency_enabled(struct intel_iommu *iommu, enum latency_type type) +{ + return false; +} + +static inline void +dmar_latency_update(struct intel_iommu *iommu, enum latency_type type, u64 latency) +{ +} + +static inline int +dmar_latency_snapshot(struct intel_iommu *iommu, char *str, size_t size) +{ + return 0; +} +#endif /* CONFIG_DMAR_PERF */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 5165cea90421..4b9b3f35ba0e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -17,19 +17,76 @@ #include <linux/dmar.h> #include <linux/interrupt.h> #include <linux/mm_types.h> +#include <linux/xarray.h> #include <linux/ioasid.h> #include <asm/page.h> #include <asm/fpu/api.h> +#include <trace/events/intel_iommu.h> #include "pasid.h" +#include "perf.h" +#include "../iommu-sva-lib.h" static irqreturn_t prq_event_thread(int irq, void *d); static void intel_svm_drain_prq(struct device *dev, u32 pasid); +#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) #define PRQ_ORDER 0 +static DEFINE_XARRAY_ALLOC(pasid_private_array); +static int pasid_private_add(ioasid_t pasid, void *priv) +{ + return xa_alloc(&pasid_private_array, &pasid, priv, + XA_LIMIT(pasid, pasid), GFP_ATOMIC); +} + +static void pasid_private_remove(ioasid_t pasid) +{ + xa_erase(&pasid_private_array, pasid); +} + +static void *pasid_private_find(ioasid_t pasid) +{ + return xa_load(&pasid_private_array, pasid); +} + +static struct intel_svm_dev * +svm_lookup_device_by_sid(struct intel_svm *svm, u16 sid) +{ + struct intel_svm_dev *sdev = NULL, *t; + + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->sid == sid) { + sdev = t; + break; + } + } + rcu_read_unlock(); + + return sdev; +} + +static struct intel_svm_dev * +svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) +{ + struct intel_svm_dev *sdev = NULL, *t; + + rcu_read_lock(); + list_for_each_entry_rcu(t, &svm->devs, list) { + if (t->dev == dev) { + sdev = t; + break; + } + } + rcu_read_unlock(); + + return sdev; +} + int intel_svm_enable_prq(struct intel_iommu *iommu) { + struct iopf_queue *iopfq; struct page *pages; int irq, ret; @@ -46,13 +103,20 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) pr_err("IOMMU: %s: Failed to create IRQ vector for page request queue\n", iommu->name); ret = -EINVAL; - err: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); - iommu->prq = NULL; - return ret; + goto free_prq; } iommu->pr_irq = irq; + snprintf(iommu->iopfq_name, sizeof(iommu->iopfq_name), + "dmar%d-iopfq", iommu->seq_id); + iopfq = iopf_queue_alloc(iommu->iopfq_name); + if (!iopfq) { + pr_err("IOMMU: %s: Failed to allocate iopf queue\n", iommu->name); + ret = -ENOMEM; + goto free_hwirq; + } + iommu->iopf_queue = iopfq; + snprintf(iommu->prq_name, sizeof(iommu->prq_name), "dmar%d-prq", iommu->seq_id); ret = request_threaded_irq(irq, NULL, prq_event_thread, IRQF_ONESHOT, @@ -60,9 +124,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) if (ret) { pr_err("IOMMU: %s: Failed to request IRQ for page request queue\n", iommu->name); - dmar_free_hwirq(irq); - iommu->pr_irq = 0; - goto err; + goto free_iopfq; } dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); @@ -71,6 +133,18 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) init_completion(&iommu->prq_complete); return 0; + +free_iopfq: + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; +free_hwirq: + dmar_free_hwirq(irq); + iommu->pr_irq = 0; +free_prq: + free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu->prq = NULL; + + return ret; } int intel_svm_finish_prq(struct intel_iommu *iommu) @@ -85,6 +159,11 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->pr_irq = 0; } + if (iommu->iopf_queue) { + iopf_queue_free(iommu->iopf_queue); + iommu->iopf_queue = NULL; + } + free_pages((unsigned long)iommu->prq, PRQ_ORDER); iommu->prq = NULL; @@ -204,17 +283,12 @@ static const struct mmu_notifier_ops intel_mmuops = { }; static DEFINE_MUTEX(pasid_mutex); -static LIST_HEAD(global_svm_list); - -#define for_each_svm_dev(sdev, svm, d) \ - list_for_each_entry((sdev), &(svm)->devs, list) \ - if ((d) != (sdev)->dev) {} else static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, struct intel_svm **rsvm, struct intel_svm_dev **rsdev) { - struct intel_svm_dev *d, *sdev = NULL; + struct intel_svm_dev *sdev = NULL; struct intel_svm *svm; /* The caller should hold the pasid_mutex lock */ @@ -224,7 +298,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, if (pasid == INVALID_IOASID || pasid >= PASID_MAX) return -EINVAL; - svm = ioasid_find(NULL, pasid, NULL); + svm = pasid_private_find(pasid); if (IS_ERR(svm)) return PTR_ERR(svm); @@ -237,15 +311,7 @@ static int pasid_to_svm_sdev(struct device *dev, unsigned int pasid, */ if (WARN_ON(list_empty(&svm->devs))) return -EINVAL; - - rcu_read_lock(); - list_for_each_entry_rcu(d, &svm->devs, list) { - if (d->dev == dev) { - sdev = d; - break; - } - } - rcu_read_unlock(); + sdev = svm_lookup_device_by_dev(svm, dev); out: *rsvm = svm; @@ -334,7 +400,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, svm->gpasid = data->gpasid; svm->flags |= SVM_FLAG_GUEST_PASID; } - ioasid_set_data(data->hpasid, svm); + pasid_private_add(data->hpasid, svm); INIT_LIST_HEAD_RCU(&svm->devs); mmput(svm->mm); } @@ -388,7 +454,7 @@ int intel_svm_bind_gpasid(struct iommu_domain *domain, struct device *dev, list_add_rcu(&sdev->list, &svm->devs); out: if (!IS_ERR_OR_NULL(svm) && list_empty(&svm->devs)) { - ioasid_set_data(data->hpasid, NULL); + pasid_private_remove(data->hpasid); kfree(svm); } @@ -431,7 +497,7 @@ int intel_svm_unbind_gpasid(struct device *dev, u32 pasid) * the unbind, IOMMU driver will get notified * and perform cleanup. */ - ioasid_set_data(pasid, NULL); + pasid_private_remove(pasid); kfree(svm); } } @@ -459,79 +525,81 @@ static void load_pasid(struct mm_struct *mm, u32 pasid) mutex_unlock(&mm->context.lock); } -/* Caller must hold pasid_mutex, mm reference */ -static int -intel_svm_bind_mm(struct device *dev, unsigned int flags, - struct mm_struct *mm, struct intel_svm_dev **sd) +static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, + unsigned int flags) { - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - struct intel_svm *svm = NULL, *t; - struct device_domain_info *info; - struct intel_svm_dev *sdev; - unsigned long iflags; - int pasid_max; - int ret; + ioasid_t max_pasid = dev_is_pci(dev) ? + pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - if (!iommu || dmar_disabled) - return -EINVAL; + return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); +} - if (!intel_svm_capable(iommu)) - return -ENOTSUPP; +static void intel_svm_free_pasid(struct mm_struct *mm) +{ + iommu_sva_free_pasid(mm); +} - if (dev_is_pci(dev)) { - pasid_max = pci_max_pasids(to_pci_dev(dev)); - if (pasid_max < 0) - return -EINVAL; - } else - pasid_max = 1 << 20; +static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, + struct device *dev, + struct mm_struct *mm, + unsigned int flags) +{ + struct device_domain_info *info = get_domain_info(dev); + unsigned long iflags, sflags; + struct intel_svm_dev *sdev; + struct intel_svm *svm; + int ret = 0; - /* Bind supervisor PASID shuld have mm = NULL */ - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap) || mm) { - pr_err("Supervisor PASID with user provided mm.\n"); - return -EINVAL; - } - } + svm = pasid_private_find(mm->pasid); + if (!svm) { + svm = kzalloc(sizeof(*svm), GFP_KERNEL); + if (!svm) + return ERR_PTR(-ENOMEM); - list_for_each_entry(t, &global_svm_list, list) { - if (t->mm != mm) - continue; + svm->pasid = mm->pasid; + svm->mm = mm; + svm->flags = flags; + INIT_LIST_HEAD_RCU(&svm->devs); - svm = t; - if (svm->pasid >= pasid_max) { - dev_warn(dev, - "Limited PASID width. Cannot use existing PASID %d\n", - svm->pasid); - ret = -ENOSPC; - goto out; + if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { + svm->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + kfree(svm); + return ERR_PTR(ret); + } } - /* Find the matching device in svm list */ - for_each_svm_dev(sdev, svm, dev) { - sdev->users++; - goto success; + ret = pasid_private_add(svm->pasid, svm); + if (ret) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + kfree(svm); + return ERR_PTR(ret); } + } - break; + /* Find the matching device in svm list */ + sdev = svm_lookup_device_by_dev(svm, dev); + if (sdev) { + sdev->users++; + goto success; } sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); if (!sdev) { ret = -ENOMEM; - goto out; + goto free_svm; } + sdev->dev = dev; sdev->iommu = iommu; - - ret = intel_iommu_enable_pasid(iommu, dev); - if (ret) { - kfree(sdev); - goto out; - } - - info = get_domain_info(dev); sdev->did = FLPT_DEFAULT_DID; sdev->sid = PCI_DEVID(info->bus, info->devfn); + sdev->users = 1; + sdev->pasid = svm->pasid; + sdev->sva.dev = dev; + init_rcu_head(&sdev->rcu); if (info->ats_enabled) { sdev->dev_iotlb = 1; sdev->qdep = info->ats_qdep; @@ -539,95 +607,37 @@ intel_svm_bind_mm(struct device *dev, unsigned int flags, sdev->qdep = 0; } - /* Finish the setup now we know we're keeping it */ - sdev->users = 1; - init_rcu_head(&sdev->rcu); - - if (!svm) { - svm = kzalloc(sizeof(*svm), GFP_KERNEL); - if (!svm) { - ret = -ENOMEM; - kfree(sdev); - goto out; - } - - if (pasid_max > intel_pasid_max_id) - pasid_max = intel_pasid_max_id; + /* Setup the pasid table: */ + sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? + PASID_FLAG_SUPERVISOR_MODE : 0; + sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + spin_lock_irqsave(&iommu->lock, iflags); + ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, + FLPT_DEFAULT_DID, sflags); + spin_unlock_irqrestore(&iommu->lock, iflags); - /* Do not use PASID 0, reserved for RID to PASID */ - svm->pasid = ioasid_alloc(NULL, PASID_MIN, - pasid_max - 1, svm); - if (svm->pasid == INVALID_IOASID) { - kfree(svm); - kfree(sdev); - ret = -ENOSPC; - goto out; - } - svm->notifier.ops = &intel_mmuops; - svm->mm = mm; - svm->flags = flags; - INIT_LIST_HEAD_RCU(&svm->devs); - INIT_LIST_HEAD(&svm->list); - ret = -ENOMEM; - if (mm) { - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - ioasid_put(svm->pasid); - kfree(svm); - kfree(sdev); - goto out; - } - } + if (ret) + goto free_sdev; - spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_first_level(iommu, dev, - mm ? mm->pgd : init_mm.pgd, - svm->pasid, FLPT_DEFAULT_DID, - (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | - (cpu_feature_enabled(X86_FEATURE_LA57) ? - PASID_FLAG_FL5LP : 0)); - spin_unlock_irqrestore(&iommu->lock, iflags); - if (ret) { - if (mm) - mmu_notifier_unregister(&svm->notifier, mm); - ioasid_put(svm->pasid); - kfree(svm); - kfree(sdev); - goto out; - } + /* The newly allocated pasid is loaded to the mm. */ + if (!(flags & SVM_FLAG_SUPERVISOR_MODE) && list_empty(&svm->devs)) + load_pasid(mm, svm->pasid); - list_add_tail(&svm->list, &global_svm_list); - if (mm) { - /* The newly allocated pasid is loaded to the mm. */ - load_pasid(mm, svm->pasid); - } - } else { - /* - * Binding a new device with existing PASID, need to setup - * the PASID entry. - */ - spin_lock_irqsave(&iommu->lock, iflags); - ret = intel_pasid_setup_first_level(iommu, dev, - mm ? mm->pgd : init_mm.pgd, - svm->pasid, FLPT_DEFAULT_DID, - (mm ? 0 : PASID_FLAG_SUPERVISOR_MODE) | - (cpu_feature_enabled(X86_FEATURE_LA57) ? - PASID_FLAG_FL5LP : 0)); - spin_unlock_irqrestore(&iommu->lock, iflags); - if (ret) { - kfree(sdev); - goto out; - } - } list_add_rcu(&sdev->list, &svm->devs); success: - sdev->pasid = svm->pasid; - sdev->sva.dev = dev; - if (sd) - *sd = sdev; - ret = 0; -out: - return ret; + return &sdev->sva; + +free_sdev: + kfree(sdev); +free_svm: + if (list_empty(&svm->devs)) { + if (svm->notifier.ops) + mmu_notifier_unregister(&svm->notifier, mm); + pasid_private_remove(mm->pasid); + kfree(svm); + } + + return ERR_PTR(ret); } /* Caller must hold pasid_mutex */ @@ -636,6 +646,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) struct intel_svm_dev *sdev; struct intel_iommu *iommu; struct intel_svm *svm; + struct mm_struct *mm; int ret = -EINVAL; iommu = device_to_iommu(dev, NULL, NULL); @@ -645,6 +656,7 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) ret = pasid_to_svm_sdev(dev, pasid, &svm, &sdev); if (ret) goto out; + mm = svm->mm; if (sdev) { sdev->users--; @@ -663,13 +675,12 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree_rcu(sdev, rcu); if (list_empty(&svm->devs)) { - ioasid_put(svm->pasid); - if (svm->mm) { - mmu_notifier_unregister(&svm->notifier, svm->mm); + if (svm->notifier.ops) { + mmu_notifier_unregister(&svm->notifier, mm); /* Clear mm's pasid. */ - load_pasid(svm->mm, PASID_DISABLED); + load_pasid(mm, PASID_DISABLED); } - list_del(&svm->list); + pasid_private_remove(svm->pasid); /* We mandate that no page faults may be outstanding * for the PASID when intel_svm_unbind_mm() is called. * If that is not obeyed, subtle errors will happen. @@ -678,6 +689,8 @@ static int intel_svm_unbind_mm(struct device *dev, u32 pasid) kfree(svm); } } + /* Drop a PASID reference and free it if no reference. */ + intel_svm_free_pasid(mm); } out: return ret; @@ -714,22 +727,6 @@ struct page_req_dsc { #define PRQ_RING_MASK ((0x1000 << PRQ_ORDER) - 0x20) -static bool access_error(struct vm_area_struct *vma, struct page_req_dsc *req) -{ - unsigned long requested = 0; - - if (req->exe_req) - requested |= VM_EXEC; - - if (req->rd_req) - requested |= VM_READ; - - if (req->wr_req) - requested |= VM_WRITE; - - return (requested & ~vma->vm_flags) != 0; -} - static bool is_canonical_address(u64 addr) { int shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); @@ -799,6 +796,8 @@ prq_retry: goto prq_retry; } + iopf_queue_flush_dev(dev); + /* * Perform steps described in VT-d spec CH7.10 to drain page * requests and responses in hardware. @@ -841,8 +840,8 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) return prot; } -static int -intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) +static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) { struct iommu_fault_event event; @@ -872,159 +871,136 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) */ event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; - memcpy(event.fault.prm.private_data, desc->priv_data, - sizeof(desc->priv_data)); + event.fault.prm.private_data[0] = desc->priv_data[0]; + event.fault.prm.private_data[1] = desc->priv_data[1]; + } else if (dmar_latency_enabled(iommu, DMAR_LATENCY_PRQ)) { + /* + * If the private data fields are not used by hardware, use it + * to monitor the prq handle latency. + */ + event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } return iommu_report_device_fault(dev, &event); } +static void handle_bad_prq_event(struct intel_iommu *iommu, + struct page_req_dsc *req, int result) +{ + struct qi_desc desc; + + pr_err("%s: Invalid page request: %08llx %08llx\n", + iommu->name, ((unsigned long long *)req)[0], + ((unsigned long long *)req)[1]); + + /* + * Per VT-d spec. v3.0 ch7.7, system software must + * respond with page group response if private data + * is present (PDP) or last page in group (LPIG) bit + * is set. This is an additional VT-d feature beyond + * PCI ATS spec. + */ + if (!req->lpig && !req->priv_data_present) + return; + + desc.qw0 = QI_PGRP_PASID(req->pasid) | + QI_PGRP_DID(req->rid) | + QI_PGRP_PASID_P(req->pasid_present) | + QI_PGRP_PDP(req->priv_data_present) | + QI_PGRP_RESP_CODE(result) | + QI_PGRP_RESP_TYPE; + desc.qw1 = QI_PGRP_IDX(req->prg_index) | + QI_PGRP_LPIG(req->lpig); + + if (req->priv_data_present) { + desc.qw2 = req->priv_data[0]; + desc.qw3 = req->priv_data[1]; + } else { + desc.qw2 = 0; + desc.qw3 = 0; + } + + qi_submit_sync(iommu, &desc, 1, 0); +} + static irqreturn_t prq_event_thread(int irq, void *d) { struct intel_svm_dev *sdev = NULL; struct intel_iommu *iommu = d; struct intel_svm *svm = NULL; - int head, tail, handled = 0; - unsigned int flags = 0; + struct page_req_dsc *req; + int head, tail, handled; + u64 address; - /* Clear PPR bit before reading head/tail registers, to - * ensure that we get a new interrupt if needed. */ + /* + * Clear PPR bit before reading head/tail registers, to ensure that + * we get a new interrupt if needed. + */ writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; + handled = (head != tail); while (head != tail) { - struct vm_area_struct *vma; - struct page_req_dsc *req; - struct qi_desc resp; - int result; - vm_fault_t ret; - u64 address; - - handled = 1; req = &iommu->prq[head / sizeof(*req)]; - result = QI_RESP_INVALID; address = (u64)req->addr << VTD_PAGE_SHIFT; - if (!req->pasid_present) { - pr_err("%s: Page request without PASID: %08llx %08llx\n", - iommu->name, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - goto no_pasid; + + if (unlikely(!req->pasid_present)) { + pr_err("IOMMU: %s: Page request without PASID\n", + iommu->name); +bad_req: + svm = NULL; + sdev = NULL; + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); + goto prq_advance; } - /* We shall not receive page request for supervisor SVM */ - if (req->pm_req && (req->rd_req | req->wr_req)) { - pr_err("Unexpected page request in Privilege Mode"); - /* No need to find the matching sdev as for bad_req */ - goto no_pasid; + + if (unlikely(!is_canonical_address(address))) { + pr_err("IOMMU: %s: Address is not canonical\n", + iommu->name); + goto bad_req; } - /* DMA read with exec requeset is not supported. */ - if (req->exe_req && req->rd_req) { - pr_err("Execution request not supported\n"); - goto no_pasid; + + if (unlikely(req->pm_req && (req->rd_req | req->wr_req))) { + pr_err("IOMMU: %s: Page request in Privilege Mode\n", + iommu->name); + goto bad_req; } + + if (unlikely(req->exe_req && req->rd_req)) { + pr_err("IOMMU: %s: Execution request not supported\n", + iommu->name); + goto bad_req; + } + if (!svm || svm->pasid != req->pasid) { - rcu_read_lock(); - svm = ioasid_find(NULL, req->pasid, NULL); - /* It *can't* go away, because the driver is not permitted + /* + * It can't go away, because the driver is not permitted * to unbind the mm while any page faults are outstanding. - * So we only need RCU to protect the internal idr code. */ - rcu_read_unlock(); - if (IS_ERR_OR_NULL(svm)) { - pr_err("%s: Page request for invalid PASID %d: %08llx %08llx\n", - iommu->name, req->pasid, ((unsigned long long *)req)[0], - ((unsigned long long *)req)[1]); - goto no_pasid; - } + */ + svm = pasid_private_find(req->pasid); + if (IS_ERR_OR_NULL(svm) || (svm->flags & SVM_FLAG_SUPERVISOR_MODE)) + goto bad_req; } if (!sdev || sdev->sid != req->rid) { - struct intel_svm_dev *t; - - sdev = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(t, &svm->devs, list) { - if (t->sid == req->rid) { - sdev = t; - break; - } - } - rcu_read_unlock(); + sdev = svm_lookup_device_by_sid(svm, req->rid); + if (!sdev) + goto bad_req; } - /* Since we're using init_mm.pgd directly, we should never take - * any faults on kernel addresses. */ - if (!svm->mm) - goto bad_req; - - /* If address is not canonical, return invalid response */ - if (!is_canonical_address(address)) - goto bad_req; + sdev->prq_seq_number++; /* * If prq is to be handled outside iommu driver via receiver of * the fault notifiers, we skip the page response here. */ - if (svm->flags & SVM_FLAG_GUEST_MODE) { - if (sdev && !intel_svm_prq_report(sdev->dev, req)) - goto prq_advance; - else - goto bad_req; - } - - /* If the mm is already defunct, don't handle faults. */ - if (!mmget_not_zero(svm->mm)) - goto bad_req; - - mmap_read_lock(svm->mm); - vma = find_extend_vma(svm->mm, address); - if (!vma || address < vma->vm_start) - goto invalid; - - if (access_error(vma, req)) - goto invalid; + if (intel_svm_prq_report(iommu, sdev->dev, req)) + handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - flags = FAULT_FLAG_USER | FAULT_FLAG_REMOTE; - if (req->wr_req) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, address, flags, NULL); - if (ret & VM_FAULT_ERROR) - goto invalid; - - result = QI_RESP_SUCCESS; -invalid: - mmap_read_unlock(svm->mm); - mmput(svm->mm); -bad_req: - /* We get here in the error case where the PASID lookup failed, - and these can be NULL. Do not use them below this point! */ - sdev = NULL; - svm = NULL; -no_pasid: - if (req->lpig || req->priv_data_present) { - /* - * Per VT-d spec. v3.0 ch7.7, system software must - * respond with page group response if private data - * is present (PDP) or last page in group (LPIG) bit - * is set. This is an additional VT-d feature beyond - * PCI ATS spec. - */ - resp.qw0 = QI_PGRP_PASID(req->pasid) | - QI_PGRP_DID(req->rid) | - QI_PGRP_PASID_P(req->pasid_present) | - QI_PGRP_PDP(req->priv_data_present) | - QI_PGRP_RESP_CODE(result) | - QI_PGRP_RESP_TYPE; - resp.qw1 = QI_PGRP_IDX(req->prg_index) | - QI_PGRP_LPIG(req->lpig); - resp.qw2 = 0; - resp.qw3 = 0; - - if (req->priv_data_present) - memcpy(&resp.qw2, req->priv_data, - sizeof(req->priv_data)); - qi_submit_sync(iommu, &resp, 1, 0); - } + trace_prq_report(iommu, sdev->dev, req->qw_0, req->qw_1, + req->priv_data[0], req->priv_data[1], + sdev->prq_seq_number); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; } @@ -1041,6 +1017,7 @@ prq_advance: head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; if (head == tail) { + iopf_queue_discard_partial(iommu->iopf_queue); writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); pr_info_ratelimited("IOMMU: %s: PRQ overflow cleared", iommu->name); @@ -1053,31 +1030,42 @@ prq_advance: return IRQ_RETVAL(handled); } -#define to_intel_svm_dev(handle) container_of(handle, struct intel_svm_dev, sva) -struct iommu_sva * -intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) { - struct iommu_sva *sva = ERR_PTR(-EINVAL); - struct intel_svm_dev *sdev = NULL; + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); unsigned int flags = 0; + struct iommu_sva *sva; int ret; - /* - * TODO: Consolidate with generic iommu-sva bind after it is merged. - * It will require shared SVM data structures, i.e. combine io_mm - * and intel_svm etc. - */ if (drvdata) flags = *(unsigned int *)drvdata; + + if (flags & SVM_FLAG_SUPERVISOR_MODE) { + if (!ecap_srs(iommu->ecap)) { + dev_err(dev, "%s: Supervisor PASID not supported\n", + iommu->name); + return ERR_PTR(-EOPNOTSUPP); + } + + if (mm) { + dev_err(dev, "%s: Supervisor PASID with user provided mm\n", + iommu->name); + return ERR_PTR(-EINVAL); + } + + mm = &init_mm; + } + mutex_lock(&pasid_mutex); - ret = intel_svm_bind_mm(dev, flags, mm, &sdev); - if (ret) - sva = ERR_PTR(ret); - else if (sdev) - sva = &sdev->sva; - else - WARN(!sdev, "SVM bind succeeded with no sdev!\n"); + ret = intel_svm_alloc_pasid(dev, mm, flags); + if (ret) { + mutex_unlock(&pasid_mutex); + return ERR_PTR(ret); + } + sva = intel_svm_bind_mm(iommu, dev, mm, flags); + if (IS_ERR_OR_NULL(sva)) + intel_svm_free_pasid(mm); mutex_unlock(&pasid_mutex); return sva; @@ -1085,10 +1073,9 @@ intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) void intel_svm_unbind(struct iommu_sva *sva) { - struct intel_svm_dev *sdev; + struct intel_svm_dev *sdev = to_intel_svm_dev(sva); mutex_lock(&pasid_mutex); - sdev = to_intel_svm_dev(sva); intel_svm_unbind_mm(sdev->dev, sdev->pasid); mutex_unlock(&pasid_mutex); } @@ -1194,9 +1181,14 @@ int intel_svm_page_response(struct device *dev, desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page); desc.qw2 = 0; desc.qw3 = 0; - if (private_present) - memcpy(&desc.qw2, prm->private_data, - sizeof(prm->private_data)); + + if (private_present) { + desc.qw2 = prm->private_data[0]; + desc.qw3 = prm->private_data[1]; + } else if (prm->private_data[0]) { + dmar_latency_update(iommu, DMAR_LATENCY_PRQ, + ktime_to_ns(ktime_get()) - prm->private_data[0]); + } qi_submit_sync(iommu, &desc, 1, 0); } |