Diffstat (limited to 'drivers/iommu/intel')
-rw-r--r--   drivers/iommu/intel/cache.c          | 239
-rw-r--r--   drivers/iommu/intel/dmar.c           | 109
-rw-r--r--   drivers/iommu/intel/iommu.c          | 512
-rw-r--r--   drivers/iommu/intel/iommu.h          | 132
-rw-r--r--   drivers/iommu/intel/irq_remapping.c  |  11
-rw-r--r--   drivers/iommu/intel/nested.c         |   3
-rw-r--r--   drivers/iommu/intel/pasid.c          |  19
-rw-r--r--   drivers/iommu/intel/perfmon.c        | 111
-rw-r--r--   drivers/iommu/intel/svm.c            |   7
9 files changed, 566 insertions, 577 deletions
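
For reference, a minimal standalone sketch of the descriptor-batching scheme the cache.c changes introduce: invalidation descriptors are collected in a fixed-size qi_batch buffer (QI_MAX_BATCHED_DESC_COUNT entries) and handed to the hardware in one shot when the buffer fills up, or when a flush loop finishes or moves to a different IOMMU. The descriptor layout and the submit routine below are simplified stand-ins, not the kernel's qi_desc/qi_submit_sync() API.

/* Standalone illustration of the qi_batch pattern; compile with any C compiler. */
#include <stdio.h>
#include <string.h>

#define QI_MAX_BATCHED_DESC_COUNT 16

struct qi_desc { unsigned long long qw0, qw1, qw2, qw3; };

struct qi_batch {
	struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT];
	unsigned int index;
};

/* Stand-in for qi_submit_sync(): pretend to hand descriptors to hardware. */
static void submit_sync(const struct qi_desc *descs, unsigned int count)
{
	printf("submitting %u descriptor(s)\n", count);
}

static void batch_flush(struct qi_batch *batch)
{
	if (!batch->index)
		return;
	submit_sync(batch->descs, batch->index);
	/* Reset the index and clear the whole buffer, as the patch does. */
	memset(batch, 0, sizeof(*batch));
}

static void batch_add(struct qi_batch *batch, unsigned long long qw0)
{
	batch->descs[batch->index].qw0 = qw0;
	if (++batch->index == QI_MAX_BATCHED_DESC_COUNT)
		batch_flush(batch);
}

int main(void)
{
	struct qi_batch batch = { 0 };

	/* 40 invalidations end up as two full batches plus a final partial one. */
	for (int i = 0; i < 40; i++)
		batch_add(&batch, (unsigned long long)i);
	batch_flush(&batch);	/* callers flush any leftovers before dropping the lock */
	return 0;
}
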
diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 44e92638c0cd..e5b89f728ad3 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -190,6 +190,13 @@ int cache_tag_assign_domain(struct dmar_domain *domain, u16 did = domain_get_id_for_dev(domain, dev); int ret; + /* domain->qi_bach will be freed in iommu_free_domain() path. */ + if (!domain->qi_batch) { + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_KERNEL); + if (!domain->qi_batch) + return -ENOMEM; + } + ret = __cache_tag_assign_domain(domain, did, dev, pasid); if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) return ret; @@ -255,6 +262,154 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); } +static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (!iommu || !batch->index) + return; + + qi_submit_sync(iommu, batch->descs, batch->index, 0); + + /* Reset the index value and clean the whole batch buffer. */ + memset(batch, 0, sizeof(*batch)); +} + +static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (++batch->index == QI_MAX_BATCHED_DESC_COUNT) + qi_batch_flush_descs(iommu, batch); +} + +static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_batch *batch) +{ + qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned int mask, + struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any Device-TLB + * invalidation requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, + u64 addr, unsigned long npages, bool ih, + struct qi_batch *batch) +{ + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (!npages) + return; + + qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any + * Device-TLB invalidation requests while address remapping hardware + * is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order, + &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long pages, + unsigned long mask, int ih) +{ + struct intel_iommu *iommu = tag->iommu; + u64 type = DMA_TLB_PSI_FLUSH; + + if (domain->use_first_level) { + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, + pages, ih, domain->qi_batch); + return; + } + + /* + * Fallback to domain selective flush if no PSI support or the size + * is too big. 
+ */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap) || pages == -1) { + addr = 0; + mask = 0; + ih = 0; + type = DMA_TLB_DSI_FLUSH; + } + + if (ecap_qis(iommu->ecap)) + qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type, + domain->qi_batch); + else + __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type); +} + +static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long mask) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) { + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + return; + } + + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, + domain->qi_batch); +} + +static void cache_tag_flush_devtlb_all(struct dmar_domain *domain, struct cache_tag *tag) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, domain->qi_batch); +} + /* * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) * when the memory mappings in the target domain have been modified. @@ -262,6 +417,7 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, unsigned long end, int ih) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -270,30 +426,14 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) { - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, addr, pages, ih); - } else { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. 
- */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); break; case CACHE_TAG_NESTING_DEVTLB: /* @@ -307,23 +447,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, mask = MAX_AGAW_PFN_WIDTH; fallthrough; case CACHE_TAG_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - if (tag->pasid == IOMMU_NO_PASID) - qi_flush_dev_iotlb(iommu, sid, info->pfsid, - info->ats_qdep, addr, mask); - else - qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, - tag->pasid, info->ats_qdep, - addr, mask); - - quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + cache_tag_flush_devtlb_psi(domain, tag, addr, mask); break; } trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -333,39 +463,30 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, */ void cache_tag_flush_all(struct dmar_domain *domain) { + struct intel_iommu *iommu = NULL; struct cache_tag *tag; unsigned long flags; spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); + cache_tag_flush_iotlb(domain, tag, 0, -1, 0, 0); break; case CACHE_TAG_DEVTLB: case CACHE_TAG_NESTING_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, - 0, MAX_AGAW_PFN_WIDTH); - quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, - IOMMU_NO_PASID, info->ats_qdep); + cache_tag_flush_devtlb_all(domain, tag); break; } trace_cache_tag_flush_all(tag); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -383,6 +504,7 @@ void cache_tag_flush_all(struct dmar_domain *domain) void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -391,7 +513,9 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { iommu_flush_write_buffer(iommu); @@ -399,22 +523,11 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, } if (tag->type == CACHE_TAG_IOTLB || - tag->type == CACHE_TAG_NESTING_IOTLB) { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. 
- */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr, mask, - DMA_TLB_PSI_FLUSH); - } + tag->type == CACHE_TAG_NESTING_IOTLB) + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 1c8d3141cb55..eaf862e8dea1 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1204,9 +1204,7 @@ static void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE || - qi->desc_status[qi->free_tail] == QI_ABORT) { - qi->desc_status[qi->free_tail] = QI_FREE; + while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) { qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; } @@ -1463,8 +1461,16 @@ restart: raw_spin_lock(&qi->q_lock); } - for (i = 0; i < count; i++) - qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; + /* + * The reclaim code can free descriptors from multiple submissions + * starting from the tail of the queue. When count == 0, the + * status of the standalone wait descriptor at the tail of the queue + * must be set to QI_FREE to allow the reclaim code to proceed. + * It is also possible that descriptors from one of the previous + * submissions has to be reclaimed by a subsequent submission. + */ + for (i = 0; i <= count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1520,24 +1526,9 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { - u8 dw = 0, dr = 0; - struct qi_desc desc; - int ih = 0; - - if (cap_write_drain(iommu->cap)) - dw = 1; - - if (cap_read_drain(iommu->cap)) - dr = 1; - - desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) - | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; - desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) - | QI_IOTLB_AM(size_order); - desc.qw2 = 0; - desc.qw3 = 0; + qi_desc_iotlb(iommu, did, addr, size_order, type, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1555,20 +1546,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - if (mask) { - addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; - desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; - } else - desc.qw1 = QI_DEV_IOTLB_ADDR(addr); - - if (qdep >= QI_DEV_IOTLB_MAX_INVS) - qdep = 0; - - desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); - desc.qw2 = 0; - desc.qw3 = 0; - + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1588,28 +1566,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, return; } - if (npages == -1) { - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(npages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - - if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) - 
addr = ALIGN_DOWN(addr, align); - - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(addr) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - + qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1617,7 +1574,6 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order) { - unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; /* @@ -1629,40 +1585,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(pfsid); - - /* - * If S bit is 0, we only flush a single page. If S bit is set, - * The least significant zero bit indicates the invalidation address - * range. VT-d spec 6.5.2.6. - * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. - * size order = 0 is PAGE_SIZE 4KB - * Max Invs Pending (MIP) is set to 0 for now until we have DIT in - * ECAP. - */ - if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) - pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", - addr, size_order); - - /* Take page address */ - desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); - - if (size_order) { - /* - * Existing 0s in address below size_order may be the least - * significant bit, we must set them to 1s to avoid having - * smaller size than desired. - */ - desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, - VTD_PAGE_SHIFT); - /* Clear size_order bit to indicate size */ - desc.qw1 &= ~mask; - /* Set the S bit to indicate flushing more than 1 page */ - desc.qw1 |= QI_DEV_EIOTLB_SIZE; - } - + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, + qdep, addr, size_order, + &desc); qi_submit_sync(iommu, &desc, 1, 0); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9ff8b83c19a3..9f6b0780f2ef 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -167,15 +167,6 @@ static void device_rbtree_remove(struct device_domain_info *info) spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } -/* - * This domain is a statically identity mapping domain. - * 1. This domain creats a static 1:1 mapping to all usable memory. - * 2. It maps to each iommu if successful. - * 3. Each iommu mapps to this domain if successful. 
- */ -static struct dmar_domain *si_domain; -static int hw_pass_through = 1; - struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ @@ -293,11 +284,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_type_is_si(struct dmar_domain *domain) -{ - return domain->domain.type == IOMMU_DOMAIN_IDENTITY; -} - static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; @@ -492,7 +478,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); - domain_update_iotlb(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -1199,9 +1184,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu, raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -/* return value determine if we need a write buffer flush */ -static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type) +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; @@ -1270,32 +1254,6 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -void domain_update_iotlb(struct dmar_domain *domain) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - bool has_iotlb_device = false; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - domain->has_iotlb_device = has_iotlb_device; - spin_unlock_irqrestore(&domain->lock, flags); -} - /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() @@ -1322,20 +1280,9 @@ static void iommu_enable_pci_caps(struct device_domain_info *info) return; pdev = to_pci_dev(info->dev); - - /* The PCIe spec, in its wisdom, declares that the behaviour of - the device if you enable PASID support after ATS support is - undefined. So always enable PASID support on devices which - have it, even if we can't yet know if we're ever going to - use it. 
*/ - if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) - info->pasid_enabled = 1; - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { + !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; - domain_update_iotlb(info->domain); - } } static void iommu_disable_pci_caps(struct device_domain_info *info) @@ -1350,12 +1297,6 @@ static void iommu_disable_pci_caps(struct device_domain_info *info) if (info->ats_enabled) { pci_disable_ats(pdev); info->ats_enabled = 0; - domain_update_iotlb(info->domain); - } - - if (info->pasid_enabled) { - pci_disable_pasid(pdev); - info->pasid_enabled = 0; } } @@ -1447,10 +1388,10 @@ static int iommu_init_domains(struct intel_iommu *iommu) * entry for first-level or pass-through translation modes should * be programmed with a domain id different from those used for * second-level or nested translation. We reserve a domain id for - * this purpose. + * this purpose. This domain id is also used for identity domain + * in legacy mode. */ - if (sm_supported(iommu)) - set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); + set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); return 0; } @@ -1524,7 +1465,6 @@ static struct dmar_domain *alloc_domain(unsigned int type) domain->nid = NUMA_NO_NODE; if (first_level_by_default(type)) domain->use_first_level = true; - domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); @@ -1632,9 +1572,65 @@ static void domain_exit(struct dmar_domain *domain) if (WARN_ON(!list_empty(&domain->devices))) return; + kfree(domain->qi_batch); kfree(domain); } +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) +{ + u16 did_old; + + if (!context_copied(iommu, bus, devfn)) + return; + + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); + + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); + } + + clear_context_copied(iommu, bus, devfn); +} + +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. 
If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } +} + static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) @@ -1647,9 +1643,6 @@ static int domain_context_mapping_one(struct dmar_domain *domain, struct context_entry *context; int agaw, ret; - if (hw_pass_through && domain_type_is_si(domain)) - translation = CONTEXT_TT_PASS_THROUGH; - pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1663,83 +1656,35 @@ static int domain_context_mapping_one(struct dmar_domain *domain, if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; - /* - * For kdump cases, old valid entries may be cached due to the - * in-flight DMA and copied pgtable, but there is no unmapping - * behaviour for them, thus we need an explicit cache flush for - * the newly-mapped device. For kdump, at this point, the device - * is supposed to finish reset at its driver probe stage, so no - * in-flight DMA will exist, and we don't need to worry anymore - * hereafter. - */ - if (context_copied(iommu, bus, devfn)) { - u16 did_old = context_domain_id(context); - - if (did_old < cap_ndoms(iommu->cap)) { - iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did_old, 0, 0, - DMA_TLB_DSI_FLUSH); - } - - clear_context_copied(iommu, bus, devfn); - } - + copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; + context_set_domain_id(context, did); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { - /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. - */ - context_set_address_width(context, iommu->msagaw); + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. 
+ */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + ret = -ENOMEM; + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + goto out_unlock; } + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - - /* - * It's a non-present to present mapping. If hardware doesn't cache - * non-present entry we only need to flush the write-buffer. If the - * _does_ cache non-present entries, then it does so in the special - * domain #0, which we have to flush: - */ - if (cap_caching_mode(iommu->cap)) { - iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } else { - iommu_flush_write_buffer(iommu); - } - + context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: @@ -1944,6 +1889,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 { struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); @@ -1952,10 +1898,11 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); } static int domain_setup_first_level(struct intel_iommu *iommu, @@ -1998,80 +1945,6 @@ static bool dev_is_real_dma_subdevice(struct device *dev) pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); } -static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long first_vpfn, - unsigned long last_vpfn) -{ - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, first_vpfn, last_vpfn); - - return __domain_mapping(domain, first_vpfn, - first_vpfn, last_vpfn - first_vpfn + 1, - DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); -} - -static int md_domain_init(struct dmar_domain *domain, int guest_width); - -static int __init si_domain_init(int hw) -{ - struct dmar_rmrr_unit *rmrr; - struct device *dev; - int i, nid, ret; - - si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); - if (!si_domain) - return -EFAULT; - - if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - domain_exit(si_domain); - si_domain = NULL; - return -EFAULT; - } - - if (hw) - return 0; - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start_pfn), - mm_to_dma_pfn_end(end_pfn-1)); - if (ret) - return ret; - } - } - - /* - * Identity map the RMRRs so that devices with RMRRs could also use - * the si_domain. 
- */ - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, dev) { - unsigned long long start = rmrr->base_address; - unsigned long long end = rmrr->end_address; - - if (WARN_ON(end < start || - end >> agaw_to_width(si_domain->agaw))) - continue; - - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start >> PAGE_SHIFT), - mm_to_dma_pfn_end(end >> PAGE_SHIFT)); - if (ret) - return ret; - } - } - - return 0; -} - static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { @@ -2094,8 +1967,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); - else if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); else @@ -2104,8 +1975,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; - if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) - iommu_enable_pci_caps(info); + iommu_enable_pci_caps(info); ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) @@ -2149,6 +2019,16 @@ static bool device_rmrr_is_relaxable(struct device *dev) static int device_def_domain_type(struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * Hardware does not support the passthrough translation mode. + * Always use a dynamaic mapping domain. + */ + if (!ecap_pass_through(iommu->ecap)) + return IOMMU_DOMAIN_DMA; + if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); @@ -2439,8 +2319,6 @@ static int __init init_dmars(void) } } - if (!ecap_pass_through(iommu->ecap)) - hw_pass_through = 0; intel_svm_check(iommu); } @@ -2456,10 +2334,6 @@ static int __init init_dmars(void) check_tylersburg_isoch(); - ret = si_domain_init(hw_pass_through); - if (ret) - goto free_iommu; - /* * for each drhd * enable fault log @@ -2505,10 +2379,6 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - if (si_domain) { - domain_exit(si_domain); - si_domain = NULL; - } return ret; } @@ -2883,12 +2753,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (ret) goto out; - if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { - pr_warn("%s: Doesn't support hardware pass through.\n", - iommu->name); - return -ENXIO; - } - sp = domain_update_iommu_superpage(NULL, iommu) - 1; if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { pr_warn("%s: Doesn't support large page.\n", @@ -3139,43 +3003,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) return 0; } -static int intel_iommu_memory_notifier(struct notifier_block *nb, - unsigned long val, void *v) -{ - struct memory_notify *mhp = v; - unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); - unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + - mhp->nr_pages - 1); - - switch (val) { - case MEM_GOING_ONLINE: - if (iommu_domain_identity_map(si_domain, - start_vpfn, last_vpfn)) { - pr_warn("Failed to build identity map for [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - break; - - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - { - LIST_HEAD(freelist); - - domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - iommu_put_pages_list(&freelist); - } - break; - } - - return NOTIFY_OK; -} - -static struct 
notifier_block intel_iommu_memory_nb = { - .notifier_call = intel_iommu_memory_notifier, - .priority = 0 -}; - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -3472,12 +3299,7 @@ int __init intel_iommu_init(void) iommu_pmu_register(iommu); } - up_read(&dmar_global_lock); - - if (si_domain && !hw_pass_through) - register_memory_notifier(&intel_iommu_memory_nb); - down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); @@ -3622,7 +3444,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st xa_init(&domain->iommu_array); domain->nid = dev_to_node(dev); - domain->has_iotlb_device = info->ats_enabled; domain->use_first_level = first_stage; /* calculate the address width */ @@ -3691,8 +3512,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) domain->geometry.force_aperture = true; return domain; - case IOMMU_DOMAIN_IDENTITY: - return &si_domain->domain; default: return NULL; } @@ -3759,8 +3578,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) WARN_ON(dmar_domain->nested_parent && !list_empty(&dmar_domain->s1_domains)); - if (domain != &si_domain->domain) - domain_exit(dmar_domain); + domain_exit(dmar_domain); } int prepare_domain_attach_device(struct iommu_domain *domain, @@ -3810,11 +3628,9 @@ int prepare_domain_attach_device(struct iommu_domain *domain, static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { - struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; - if (info->domain) - device_block_translation(dev); + device_block_translation(dev); ret = prepare_domain_attach_device(domain, dev); if (ret) @@ -4091,6 +3907,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; @@ -4112,6 +3929,16 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) intel_iommu_debugfs_create_dev(info); + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. 
+ */ + if (info->pasid_supported && + !pci_enable_pasid(pdev, info->pasid_supported & ~1)) + info->pasid_enabled = 1; + return &iommu->iommu; free_table: intel_pasid_free_table(dev); @@ -4128,6 +3955,11 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); @@ -4249,6 +4081,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); if (context_copied(iommu, bus, devfn)) { @@ -4261,6 +4094,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) spin_unlock(&iommu->lock); return -ENODEV; } + did = context_domain_id(context); if (enable) context_set_sm_pre(context); @@ -4269,7 +4103,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); spin_unlock(&iommu->lock); return 0; @@ -4420,11 +4254,17 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; unsigned long flags; + if (domain->type == IOMMU_DOMAIN_IDENTITY) { + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + return; + } + + dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4479,9 +4319,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_detach_iommu; - if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, pasid); - else if (dmar_domain->use_first_level) + if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid); else @@ -4651,9 +4489,111 @@ static const struct iommu_dirty_ops intel_dirty_ops = { .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, }; +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. 
+ */ + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} + +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev != &pdev->dev) + return 0; + + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} + +static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + if (sm_supported(iommu)) { + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + if (!ret) + iommu_enable_pci_caps(info); + } else { + ret = device_setup_pass_through(dev); + } + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + return intel_pasid_setup_pass_through(iommu, dev, pasid); +} + +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, + .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b67c14da1240..1497f3112b12 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -584,11 +584,23 @@ struct iommu_domain_info { * to VT-d spec, section 9.3 */ }; +/* + * We start simply by using a fixed size for the batched descriptors. This + * size is currently sufficient for our needs. Future improvements could + * involve dynamically allocating the batch buffer based on actual demand, + * allowing us to adjust the batch size for optimal performance in different + * scenarios. 
+ */ +#define QI_MAX_BATCHED_DESC_COUNT 16 +struct qi_batch { + struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT]; + unsigned int index; +}; + struct dmar_domain { int nid; /* node id */ struct xarray iommu_array; /* Attached IOMMU array */ - u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ u8 set_pte_snp:1; @@ -609,6 +621,7 @@ struct dmar_domain { spinlock_t cache_lock; /* Protect the cache tag list */ struct list_head cache_tags; /* Cache tag list */ + struct qi_batch *qi_batch; /* Batched QI descriptors */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, @@ -687,8 +700,6 @@ struct iommu_pmu { DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX); struct perf_event *event_list[IOMMU_PMU_IDX_MAX]; unsigned char irq_name[16]; - struct hlist_node cpuhp_node; - int cpu; }; #define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED) @@ -1067,6 +1078,115 @@ static inline unsigned long nrpages_to_size(unsigned long npages) return npages << VTD_PAGE_SHIFT; } +static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_desc *desc) +{ + u8 dw = 0, dr = 0; + int ih = 0; + + if (cap_write_drain(iommu->cap)) + dw = 1; + + if (cap_read_drain(iommu->cap)) + dr = 1; + + desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; + desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + | QI_IOTLB_AM(size_order); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr, + unsigned int mask, struct qi_desc *desc) +{ + if (mask) { + addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else { + desc->qw1 = QI_DEV_IOTLB_ADDR(addr); + } + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih, + struct qi_desc *desc) +{ + if (npages == -1) { + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } +} + +static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, + u16 qdep, u64 addr, + unsigned int size_order, + struct qi_desc *desc) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + + desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. 
+ */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc->qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. + */ + desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc->qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc->qw1 |= QI_DEV_EIOTLB_SIZE; + } +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1098,13 +1218,15 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options); + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); /* * Options used in qi_submit_sync: * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. */ #define QI_OPT_WAIT_DRAIN BIT(0) -void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); @@ -1154,7 +1276,7 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool affect_domains); + u16 did, bool affect_domains); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index e090ca07364b..7a6d188e3bea 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -1352,12 +1352,11 @@ static void intel_irq_remapping_prepare_irte(struct intel_ir_data *data, case X86_IRQ_ALLOC_TYPE_IOAPIC: /* Set source-id of interrupt request */ set_ioapic_sid(irte, info->devid); - apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", - info->devid, irte->present, irte->fpd, - irte->dst_mode, irte->redir_hint, - irte->trigger_mode, irte->dlvry_mode, - irte->avail, irte->vector, irte->dest_id, - irte->sid, irte->sq, irte->svt); + apic_pr_verbose("IOAPIC[%d]: Set IRTE entry (P:%d FPD:%d Dst_Mode:%d Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X Avail:%X Vector:%02X Dest:%08X SID:%04X SQ:%X SVT:%X)\n", + info->devid, irte->present, irte->fpd, irte->dst_mode, + irte->redir_hint, irte->trigger_mode, irte->dlvry_mode, + irte->avail, irte->vector, irte->dest_id, irte->sid, + irte->sq, irte->svt); sub_handle = info->ioapic.pin; break; case X86_IRQ_ALLOC_TYPE_HPET: diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 16a2bcf5cfeb..433c58944401 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -66,8 +66,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); - domain_update_iotlb(dmar_domain); - return 0; unassign_tag: cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); @@ 
-85,6 +83,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain) spin_lock(&s2_domain->s1_lock); list_del(&dmar_domain->s2_link); spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain->qi_batch); kfree(dmar_domain); } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5792c817cefa..2e5fa0a23299 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -264,9 +264,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, else iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); } /* @@ -493,9 +491,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); return 0; } @@ -572,9 +568,7 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, pasid_cache_invalidation_with_pasid(iommu, did, pasid); qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); } /** @@ -683,6 +677,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, false); @@ -691,10 +686,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, false); + intel_context_flush_present(info, context, did, false); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -885,10 +881,9 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) */ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool flush_domains) + u16 did, bool flush_domains) { struct intel_iommu *iommu = info->iommu; - u16 did = context_domain_id(context); struct pasid_entry *pte; int i; diff --git a/drivers/iommu/intel/perfmon.c b/drivers/iommu/intel/perfmon.c index 44083d01852d..75f493bcb353 100644 --- a/drivers/iommu/intel/perfmon.c +++ b/drivers/iommu/intel/perfmon.c @@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = { .attrs = attrs_empty, }; -static cpumask_t iommu_pmu_cpu_mask; - -static ssize_t -cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask); -} -static DEVICE_ATTR_RO(cpumask); - -static struct attribute *iommu_pmu_cpumask_attrs[] = { - &dev_attr_cpumask.attr, - NULL -}; - -static struct attribute_group iommu_pmu_cpumask_attr_group = { - .attrs = iommu_pmu_cpumask_attrs, -}; - static const struct attribute_group *iommu_pmu_attr_groups[] = { &iommu_pmu_format_attr_group, 
&iommu_pmu_events_attr_group, - &iommu_pmu_cpumask_attr_group, NULL }; @@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu) iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups; iommu_pmu->pmu.attr_update = iommu_pmu_attr_update; iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE; iommu_pmu->pmu.module = THIS_MODULE; return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1); @@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu) iommu->perf_irq = 0; } -static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - - if (cpumask_empty(&iommu_pmu_cpu_mask)) - cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask); - - if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask)) - iommu_pmu->cpu = cpu; - - return 0; -} - -static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node) -{ - struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node); - int target = cpumask_first(&iommu_pmu_cpu_mask); - - /* - * The iommu_pmu_cpu_mask has been updated when offline the CPU - * for the first iommu_pmu. Migrate the other iommu_pmu to the - * new target. - */ - if (target < nr_cpu_ids && target != iommu_pmu->cpu) { - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - return 0; - } - - if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask)) - return 0; - - target = cpumask_any_but(cpu_online_mask, cpu); - - if (target < nr_cpu_ids) - cpumask_set_cpu(target, &iommu_pmu_cpu_mask); - else - return 0; - - perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target); - iommu_pmu->cpu = target; - - return 0; -} - -static int nr_iommu_pmu; -static enum cpuhp_state iommu_cpuhp_slot; - -static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu) -{ - int ret; - - if (!nr_iommu_pmu) { - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, - "driver/iommu/intel/perfmon:online", - iommu_pmu_cpu_online, - iommu_pmu_cpu_offline); - if (ret < 0) - return ret; - iommu_cpuhp_slot = ret; - } - - ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - if (ret) { - if (!nr_iommu_pmu) - cpuhp_remove_multi_state(iommu_cpuhp_slot); - return ret; - } - nr_iommu_pmu++; - - return 0; -} - -static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu) -{ - cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node); - - if (--nr_iommu_pmu) - return; - - cpuhp_remove_multi_state(iommu_cpuhp_slot); -} - void iommu_pmu_register(struct intel_iommu *iommu) { struct iommu_pmu *iommu_pmu = iommu->pmu; @@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu) if (__iommu_pmu_register(iommu)) goto err; - if (iommu_pmu_cpuhp_setup(iommu_pmu)) - goto unregister; - /* Set interrupt for overflow */ if (iommu_pmu_set_interrupt(iommu)) - goto cpuhp_free; + goto unregister; return; -cpuhp_free: - iommu_pmu_cpuhp_free(iommu_pmu); unregister: perf_pmu_unregister(&iommu_pmu->pmu); err: @@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu) return; iommu_pmu_unset_interrupt(iommu); - iommu_pmu_cpuhp_free(iommu_pmu); perf_pmu_unregister(&iommu_pmu->pmu); } diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0e3a9b38bef2..078d1e32a24e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -184,7 +184,10 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static 
void intel_mm_free_notifier(struct mmu_notifier *mn) { - kfree(container_of(mn, struct dmar_domain, notifier)); + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + kfree(domain->qi_batch); + kfree(domain); } static const struct mmu_notifier_ops intel_mmuops = { @@ -311,7 +314,7 @@ void intel_drain_pasid_prq(struct device *dev, u32 pasid) domain = info->domain; pdev = to_pci_dev(dev); sid = PCI_DEVID(info->bus, info->devfn); - did = domain_id_iommu(domain, iommu); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; qdep = pci_ats_queue_depth(pdev); /* |
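
As a small worked example of the address encoding that the new qi_desc_dev_iotlb_pasid() helper in iommu.h performs (per the VT-d spec 6.5.2.6 comment above): for a multi-page flush the address bits below size_order are forced to 1, the bit at (page shift + size_order - 1) is cleared to encode the range size, and the S bit marks "more than one page". The constants here are local stand-ins for the kernel's VTD_PAGE_SHIFT and QI_DEV_EIOTLB_SIZE macros, and the program only demonstrates the bit manipulation, not the full descriptor.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12			/* stand-in for VTD_PAGE_SHIFT */
#define DEVTLB_S_BIT	(1ULL << 11)		/* stand-in for QI_DEV_EIOTLB_SIZE */

static uint64_t encode_devtlb_addr(uint64_t addr, unsigned int size_order)
{
	uint64_t qw1 = addr & ~((1ULL << PAGE_SHIFT) - 1);	/* take page address */

	if (size_order) {
		/* Set the low "don't care" bits below size_order to 1s... */
		qw1 |= ((1ULL << size_order) - 1) << PAGE_SHIFT;
		/* ...clear the bit that encodes the invalidation range size... */
		qw1 &= ~(1ULL << (PAGE_SHIFT + size_order - 1));
		/* ...and set the S bit to flag flushing more than one page. */
		qw1 |= DEVTLB_S_BIT;
	}
	return qw1;
}

int main(void)
{
	/* size_order 0 = one 4KiB page, 1 = 8KiB, 2 = 16KiB, ... */
	for (unsigned int so = 0; so <= 2; so++)
		printf("size_order %u -> qw1 = 0x%llx\n", so,
		       (unsigned long long)encode_devtlb_addr(0x120000, so));
	return 0;
}
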