From 0c5f6c0d8201a809a6585b07b6263e9db2c874a3 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Tue, 23 Aug 2022 14:15:54 +0800
Subject: iommu/vt-d: Fix kdump kernels boot failure with scalable mode

The translation table copying code for kdump kernels is currently based
on the extended root/context entry formats of the ECS mode defined in the
older VT-d v2.5 spec, and doesn't handle the scalable mode formats. This
causes a kexec capture kernel boot failure with DMAR faults if the IOMMU
was enabled in scalable mode by the previous kernel.

The ECS mode has already been deprecated by the VT-d spec since v3.0 and
the Intel IOMMU driver doesn't support this mode as there's no real
hardware implementation. Hence this converts the ECS checking in the
table copying code into scalable mode checking.

The existing copying code consumes a bit in the context entry as a mark
of a copied entry, which needs to work for the old format as well as for
the extended context entries. As it's hard to find such a common bit for
both legacy and scalable mode context entries, this replaces it with a
per-IOMMU bitmap.

Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support")
Cc: stable@vger.kernel.org
Reported-by: Jerry Snitselaar
Tested-by: Wen Jin
Signed-off-by: Lu Baolu
Link: https://lore.kernel.org/r/20220817011035.3250131-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 100 +++++++++++++++++++-------------------------
 drivers/iommu/intel/iommu.h |   9 ++--
 2 files changed, 50 insertions(+), 59 deletions(-)

(limited to 'drivers/iommu/intel')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 7cca030a508e..b9d058c27568 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -163,38 +163,6 @@ static phys_addr_t root_entry_uctp(struct root_entry *re)
 	return re->hi & VTD_PAGE_MASK;
 }
 
-static inline void context_clear_pasid_enable(struct context_entry *context)
-{
-	context->lo &= ~(1ULL << 11);
-}
-
-static inline bool context_pasid_enabled(struct context_entry *context)
-{
-	return !!(context->lo & (1ULL << 11));
-}
-
-static inline void context_set_copied(struct context_entry *context)
-{
-	context->hi |= (1ull << 3);
-}
-
-static inline bool context_copied(struct context_entry *context)
-{
-	return !!(context->hi & (1ULL << 3));
-}
-
-static inline bool __context_present(struct context_entry *context)
-{
-	return (context->lo & 1);
-}
-
-bool context_present(struct context_entry *context)
-{
-	return context_pasid_enabled(context) ?
-	     __context_present(context) :
-	     __context_present(context) && !context_copied(context);
-}
-
 static inline void context_set_present(struct context_entry *context)
 {
 	context->lo |= 1;
@@ -242,6 +210,26 @@ static inline void context_clear_entry(struct context_entry *context)
 	context->hi = 0;
 }
 
+static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	if (!iommu->copied_tables)
+		return false;
+
+	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
+static inline void
+clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
+{
+	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
+}
+
 /*
  * This domain is a statically identity mapping domain.
  *	1. This domain creats a static 1:1 mapping to all usable memory.
@@ -578,6 +566,13 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 	struct context_entry *context;
 	u64 *entry;
 
+	/*
+	 * Except that the caller requested to allocate a new entry,
+	 * returning a copied context entry makes no sense.
+	 */
+	if (!alloc && context_copied(iommu, bus, devfn))
+		return NULL;
+
 	entry = &root->lo;
 	if (sm_supported(iommu)) {
 		if (devfn >= 0x80) {
@@ -1688,6 +1683,11 @@ static void free_dmar_iommu(struct intel_iommu *iommu)
 		iommu->domain_ids = NULL;
 	}
 
+	if (iommu->copied_tables) {
+		bitmap_free(iommu->copied_tables);
+		iommu->copied_tables = NULL;
+	}
+
 	/* free context mapping */
 	free_context_table(iommu);
 
@@ -1913,7 +1913,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 		goto out_unlock;
 
 	ret = 0;
-	if (context_present(context))
+	if (context_present(context) && !context_copied(iommu, bus, devfn))
 		goto out_unlock;
 
 	/*
@@ -1925,7 +1925,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 	 * in-flight DMA will exist, and we don't need to worry anymore
 	 * hereafter.
 	 */
-	if (context_copied(context)) {
+	if (context_copied(iommu, bus, devfn)) {
 		u16 did_old = context_domain_id(context);
 
 		if (did_old < cap_ndoms(iommu->cap)) {
@@ -1936,6 +1936,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
 						 DMA_TLB_DSI_FLUSH);
 		}
+
+		clear_context_copied(iommu, bus, devfn);
 	}
 
 	context_clear_entry(context);
@@ -2684,32 +2686,14 @@ static int copy_context_table(struct intel_iommu *iommu,
 		/* Now copy the context entry */
 		memcpy(&ce, old_ce + idx, sizeof(ce));
 
-		if (!__context_present(&ce))
+		if (!context_present(&ce))
 			continue;
 
 		did = context_domain_id(&ce);
 		if (did >= 0 && did < cap_ndoms(iommu->cap))
 			set_bit(did, iommu->domain_ids);
 
-		/*
-		 * We need a marker for copied context entries. This
-		 * marker needs to work for the old format as well as
-		 * for extended context entries.
-		 *
-		 * Bit 67 of the context entry is used. In the old
-		 * format this bit is available to software, in the
-		 * extended format it is the PGE bit, but PGE is ignored
-		 * by HW if PASIDs are disabled (and thus still
-		 * available).
-		 *
-		 * So disable PASIDs first and then mark the entry
-		 * copied. This means that we don't copy PASID
-		 * translations from the old kernel, but this is fine as
-		 * faults there are not fatal.
-		 */
-		context_clear_pasid_enable(&ce);
-		context_set_copied(&ce);
-
+		set_context_copied(iommu, bus, devfn);
 		new_ce[idx] = ce;
 	}
 
@@ -2735,8 +2719,8 @@ static int copy_translation_tables(struct intel_iommu *iommu)
 	bool new_ext, ext;
 
 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
-	ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
-	new_ext = !!ecap_ecs(iommu->ecap);
+	ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
+	new_ext = !!sm_supported(iommu);
 
 	/*
 	 * The RTT bit can only be changed when translation is disabled,
@@ -2747,6 +2731,10 @@ static int copy_translation_tables(struct intel_iommu *iommu)
 	if (new_ext != ext)
 		return -EINVAL;
 
+	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
+	if (!iommu->copied_tables)
+		return -ENOMEM;
+
 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
 	if (!old_rt_phys)
 		return -EINVAL;
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index fae45bbb0c7f..74b0e19e23ee 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -197,7 +197,6 @@
 #define ecap_dis(e) (((e) >> 27) & 0x1)
 #define ecap_nest(e) (((e) >> 26) & 0x1)
 #define ecap_mts(e) (((e) >> 25) & 0x1)
-#define ecap_ecs(e) (((e) >> 24) & 0x1)
 #define ecap_iotlb_offset(e) ((((e) >> 8) & 0x3ff) * 16)
 #define ecap_max_iotlb_offset(e) (ecap_iotlb_offset(e) + 16)
 #define ecap_coherent(e) ((e) & 0x1)
@@ -265,7 +264,6 @@
 #define DMA_GSTS_CFIS (((u32)1) << 23)
 
 /* DMA_RTADDR_REG */
-#define DMA_RTADDR_RTT (((u64)1) << 11)
 #define DMA_RTADDR_SMT (((u64)1) << 10)
 
 /* CCMD_REG */
@@ -579,6 +577,7 @@ struct intel_iommu {
 
 #ifdef CONFIG_INTEL_IOMMU
 	unsigned long *domain_ids; /* bitmap of domains */
+	unsigned long *copied_tables; /* bitmap of copied tables */
 	spinlock_t lock; /* protect context, domain ids */
 	struct root_entry *root_entry; /* virtual address */
@@ -701,6 +700,11 @@ static inline int nr_pte_to_next_page(struct dma_pte *pte)
 		(struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte;
 }
 
+static inline bool context_present(struct context_entry *context)
+{
+	return (context->lo & 1);
+}
+
 extern struct dmar_drhd_unit * dmar_find_matched_drhd_unit(struct pci_dev *dev);
 extern int dmar_enable_qi(struct intel_iommu *iommu);
 
@@ -784,7 +788,6 @@ static inline void intel_iommu_debugfs_init(void) {}
 #endif /* CONFIG_INTEL_IOMMU_DEBUGFS */
 
 extern const struct attribute_group *intel_iommu_groups[];
-bool context_present(struct context_entry *context);
 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 					 u8 devfn, int alloc);
-- cgit

From 53fc7ad6edf210b497230ce74b61b322a202470c Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Tue, 23 Aug 2022 14:15:55 +0800
Subject: iommu/vt-d: Correctly calculate sagaw value of IOMMU

The Intel IOMMU driver possibly selects between the first-level and the
second-level translation tables for DMA address translation. However,
the levels of page-table walks for the 4KB base page size are calculated
from the SAGAW field of the capability register, which is only valid for
the second-level page table. This causes the IOMMU driver to stop working
if the hardware (or the emulated IOMMU) advertises only first-level
translation capability and reports the SAGAW field as 0.

This solves the above problem by considering both the first level and
the second level when calculating the supported page table levels.

Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level")
Cc: stable@vger.kernel.org
Signed-off-by: Lu Baolu
Link: https://lore.kernel.org/r/20220817023558.3253263-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'drivers/iommu/intel')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b9d058c27568..b155c7af7d15 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -390,14 +390,36 @@ static inline int domain_pfn_supported(struct dmar_domain *domain,
 	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 }
 
+/*
+ * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
+ * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
+ * the returned SAGAW.
+ */
+static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
+{
+	unsigned long fl_sagaw, sl_sagaw;
+
+	fl_sagaw = BIT(2) | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0);
+	sl_sagaw = cap_sagaw(iommu->cap);
+
+	/* Second level only. */
+	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
+		return sl_sagaw;
+
+	/* First level only. */
+	if (!ecap_slts(iommu->ecap))
+		return fl_sagaw;
+
+	return fl_sagaw & sl_sagaw;
+}
+
 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 {
 	unsigned long sagaw;
 	int agaw;
 
-	sagaw = cap_sagaw(iommu->cap);
-	for (agaw = width_to_agaw(max_gaw);
-	     agaw >= 0; agaw--) {
+	sagaw = __iommu_calculate_sagaw(iommu);
+	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 		if (test_bit(agaw, &sagaw))
 			break;
 	}
-- cgit

From a349ffcb4daf77a4040ab5a6c95295bb533bf384 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Tue, 23 Aug 2022 14:15:56 +0800
Subject: iommu/vt-d: Fix recursive lock issue in iommu_flush_dev_iotlb()

The per-domain spinlock is acquired in iommu_flush_dev_iotlb(), which
may be called in interrupt context. For example, the drm-intel CI system
got completely blocked with the below error:

WARNING: inconsistent lock state
6.0.0-rc1-CI_DRM_11990-g6590d43d39b9+ #1 Not tainted
--------------------------------
inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
swapper/6/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
ffff88810440d678 (&domain->lock){+.?.}-{2:2}, at: iommu_flush_dev_iotlb.part.61+0x23/0x80
{SOFTIRQ-ON-W} state was registered at:
  lock_acquire+0xd3/0x310
  _raw_spin_lock+0x2a/0x40
  domain_update_iommu_cap+0x20b/0x2c0
  intel_iommu_attach_device+0x5bd/0x860
  __iommu_attach_device+0x18/0xe0
  bus_iommu_probe+0x1f3/0x2d0
  bus_set_iommu+0x82/0xd0
  intel_iommu_init+0xe45/0x102a
  pci_iommu_init+0x9/0x31
  do_one_initcall+0x53/0x2f0
  kernel_init_freeable+0x18f/0x1e1
  kernel_init+0x11/0x120
  ret_from_fork+0x1f/0x30
irq event stamp: 162354
hardirqs last enabled at (162354): [] _raw_spin_unlock_irqrestore+0x54/0x70
hardirqs last disabled at (162353): [] _raw_spin_lock_irqsave+0x4b/0x50
softirqs last enabled at (162338): [] __do_softirq+0x323/0x48e
softirqs last disabled at (162349): [] irq_exit_rcu+0xb8/0xe0
other info that might help us debug this:
 Possible unsafe locking scenario:
       CPU0
       ----
  lock(&domain->lock);
  lock(&domain->lock);
 *** DEADLOCK ***
1 lock held by swapper/6/0:

This converts the spin_lock/unlock() calls into the irqsave/irqrestore
variants to fix the recursive locking issue.

Fixes: ffd5869d93530 ("iommu/vt-d: Replace spin_lock_irqsave() with spin_lock()")
Signed-off-by: Lu Baolu
Acked-by: Lucas De Marchi
Link: https://lore.kernel.org/r/20220817025650.3253959-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

(limited to 'drivers/iommu/intel')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index b155c7af7d15..e3fe1a148187 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -515,8 +515,9 @@ static int domain_update_device_node(struct dmar_domain *domain)
 {
 	struct device_domain_info *info;
 	int nid = NUMA_NO_NODE;
+	unsigned long flags;
 
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_for_each_entry(info, &domain->devices, link) {
 		/*
 		 * There could possibly be multiple device numa nodes as devices
@@ -528,7 +529,7 @@ static int domain_update_device_node(struct dmar_domain *domain)
 		if (nid != NUMA_NO_NODE)
 			break;
 	}
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 
 	return nid;
 }
@@ -1362,19 +1363,20 @@ iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
 			u8 bus, u8 devfn)
 {
 	struct device_domain_info *info;
+	unsigned long flags;
 
 	if (!iommu->qi)
 		return NULL;
 
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_for_each_entry(info, &domain->devices, link) {
 		if (info->iommu == iommu && info->bus == bus &&
 		    info->devfn == devfn) {
-			spin_unlock(&domain->lock);
+			spin_unlock_irqrestore(&domain->lock, flags);
 			return info->ats_supported ? info : NULL;
 		}
 	}
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 
 	return NULL;
 }
@@ -1383,8 +1385,9 @@ static void domain_update_iotlb(struct dmar_domain *domain)
 {
 	struct device_domain_info *info;
 	bool has_iotlb_device = false;
+	unsigned long flags;
 
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_for_each_entry(info, &domain->devices, link) {
 		if (info->ats_enabled) {
 			has_iotlb_device = true;
@@ -1392,7 +1395,7 @@ static void domain_update_iotlb(struct dmar_domain *domain)
 		}
 	}
 	domain->has_iotlb_device = has_iotlb_device;
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
@@ -1484,14 +1487,15 @@ static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
 				  u64 addr, unsigned mask)
 {
 	struct device_domain_info *info;
+	unsigned long flags;
 
 	if (!domain->has_iotlb_device)
 		return;
 
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_for_each_entry(info, &domain->devices, link)
 		__iommu_flush_dev_iotlb(info, addr, mask);
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
@@ -2453,6 +2457,7 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct intel_iommu *iommu;
+	unsigned long flags;
 	u8 bus, devfn;
 	int ret;
 
@@ -2464,9 +2469,9 @@ static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
 	if (ret)
 		return ret;
 	info->domain = domain;
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_add(&info->link, &domain->devices);
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 
 	/* PASID table is mandatory for a PCI device in scalable mode. */
 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
@@ -4090,6 +4095,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct dmar_domain *domain = info->domain;
 	struct intel_iommu *iommu = info->iommu;
+	unsigned long flags;
 
 	if (!dev_is_real_dma_subdevice(info->dev)) {
 		if (dev_is_pci(info->dev) && sm_supported(iommu))
@@ -4101,9 +4107,9 @@ static void dmar_remove_one_dev_info(struct device *dev)
 		intel_pasid_free_table(info->dev);
 	}
 
-	spin_lock(&domain->lock);
+	spin_lock_irqsave(&domain->lock, flags);
 	list_del(&info->link);
-	spin_unlock(&domain->lock);
+	spin_unlock_irqrestore(&domain->lock, flags);
 
 	domain_detach_iommu(domain, iommu);
 	info->domain = NULL;
@@ -4422,19 +4428,20 @@ static void domain_set_force_snooping(struct dmar_domain *domain)
 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
 {
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+	unsigned long flags;
 
 	if (dmar_domain->force_snooping)
 		return true;
 
-	spin_lock(&dmar_domain->lock);
+	spin_lock_irqsave(&dmar_domain->lock, flags);
 	if (!domain_support_force_snooping(dmar_domain)) {
-		spin_unlock(&dmar_domain->lock);
+		spin_unlock_irqrestore(&dmar_domain->lock, flags);
 		return false;
 	}
 
 	domain_set_force_snooping(dmar_domain);
 	dmar_domain->force_snooping = true;
-	spin_unlock(&dmar_domain->lock);
+	spin_unlock_irqrestore(&dmar_domain->lock, flags);
 
 	return true;
 }
-- cgit

From 35bf49e054214a0c7d3b0a0f8606f753f3f6ae96 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Tue, 23 Aug 2022 14:15:57 +0800
Subject: iommu/vt-d: Fix lockdep splat due to klist iteration in atomic context

With CONFIG_INTEL_IOMMU_DEBUGFS enabled, the below lockdep splat is seen
when an I/O fault occurs on a machine with an Intel IOMMU in it.

DMAR: DRHD: handling fault status reg 3
DMAR: [DMA Write NO_PASID] Request device [00:1a.0] fault addr 0x0 [fault reason 0x05] PTE Write access is not set
DMAR: Dump dmar0 table entries for IOVA 0x0
DMAR: root entry: 0x0000000127f42001
DMAR: context entry: hi 0x0000000000001502, low 0x000000012d8ab001
================================
WARNING: inconsistent lock state
5.20.0-0.rc0.20220812git7ebfc85e2cd7.10.fc38.x86_64 #1 Not tainted
--------------------------------
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
rngd/1006 [HC1[1]:SC0[0]:HE0:SE1] takes:
ff177021416f2d78 (&k->k_lock){?.+.}-{2:2}, at: klist_next+0x1b/0x160
{HARDIRQ-ON-W} state was registered at:
  lock_acquire+0xce/0x2d0
  _raw_spin_lock+0x33/0x80
  klist_add_tail+0x46/0x80
  bus_add_device+0xee/0x150
  device_add+0x39d/0x9a0
  add_memory_block+0x108/0x1d0
  memory_dev_init+0xe1/0x117
  driver_init+0x43/0x4d
  kernel_init_freeable+0x1c2/0x2cc
  kernel_init+0x16/0x140
  ret_from_fork+0x1f/0x30
irq event stamp: 7812
hardirqs last enabled at (7811): [] asm_sysvec_apic_timer_interrupt+0x16/0x20
hardirqs last disabled at (7812): [] irqentry_enter+0x54/0x60
softirqs last enabled at (7794): [] __irq_exit_rcu+0xf9/0x170
softirqs last disabled at (7787): [] __irq_exit_rcu+0xf9/0x170

The klist iterator functions use spin_*lock_irq*() while the klist
insertion functions use spin_*lock(). The Intel DMAR IOMMU driver
iterates over klists from atomic (hardirq) context, where
pci_get_domain_bus_and_slot() calls into bus_find_device() which
iterates over klists, and this combination triggers the splat above.

As currently there's no plan to fix the klist to make it safe to use in
atomic context, this fixes the lockdep splat by avoiding the call to
pci_get_domain_bus_and_slot() in hardirq context.

Fixes: 8ac0b64b9735 ("iommu/vt-d: Use pci_get_domain_bus_and_slot() in pgtable_walk()")
Reported-by: Lennert Buytenhek
Link: https://lore.kernel.org/linux-iommu/Yvo2dfpEh%2FWC+Wrr@wantstofly.org/
Link: https://lore.kernel.org/linux-iommu/YvyBdPwrTuHHbn5X@wantstofly.org/
Signed-off-by: Lu Baolu
Link: https://lore.kernel.org/r/20220819015949.4795-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/iommu.c | 47 ++++++++++++++++++---------------------------
 1 file changed, 19 insertions(+), 28 deletions(-)

(limited to 'drivers/iommu/intel')

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index e3fe1a148187..64d30895a4c8 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -813,32 +813,11 @@ static void free_context_table(struct intel_iommu *iommu)
 }
 
 #ifdef CONFIG_DMAR_DEBUG
-static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn, u8 bus, u8 devfn)
+static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
+			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
 {
-	struct device_domain_info *info;
-	struct dma_pte *parent, *pte;
-	struct dmar_domain *domain;
-	struct pci_dev *pdev;
-	int offset, level;
-
-	pdev = pci_get_domain_bus_and_slot(iommu->segment, bus, devfn);
-	if (!pdev)
-		return;
-
-	info = dev_iommu_priv_get(&pdev->dev);
-	if (!info || !info->domain) {
-		pr_info("device [%02x:%02x.%d] not probed\n",
-			bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
-		return;
-	}
-
-	domain = info->domain;
-	level = agaw_to_level(domain->agaw);
-	parent = domain->pgd;
-	if (!parent) {
-		pr_info("no page table setup\n");
-		return;
-	}
+	struct dma_pte *pte;
+	int offset;
 
 	while (1) {
 		offset = pfn_level_offset(pfn, level);
@@ -865,9 +844,10 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 	struct pasid_entry *entries, *pte;
 	struct context_entry *ctx_entry;
 	struct root_entry *rt_entry;
+	int i, dir_index, index, level;
 	u8 devfn = source_id & 0xff;
 	u8 bus = source_id >> 8;
-	int i, dir_index, index;
+	struct dma_pte *pgtable;
 
 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 
@@ -895,8 +875,11 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 		       ctx_entry->hi, ctx_entry->lo);
 
 	/* legacy mode does not require PASID entries */
-	if (!sm_supported(iommu))
+	if (!sm_supported(iommu)) {
+		level = agaw_to_level(ctx_entry->hi & 7);
+		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 		goto pgtable_walk;
+	}
 
 	/* get the pointer to pasid directory entry */
 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
@@ -923,8 +906,16 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 
+	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
+		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
+		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
+	} else {
+		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
+		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
+	}
+
 pgtable_walk:
-	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn);
+	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 }
 #endif
-- cgit

From 9cd4f1434479f1ac25c440c421fbf52069079914 Mon Sep 17 00:00:00 2001
From: Lu Baolu
Date: Sun, 11 Sep 2022 11:18:45 +0800
Subject: iommu/vt-d: Fix possible recursive locking in intel_iommu_init()

The global rwsem dmar_global_lock was introduced by commit 3a5670e8ac932
("iommu/vt-d: Introduce a rwsem to protect global data structures"). It
is used to protect DMAR related global data from DMAR hotplug operations.

The dmar_global_lock used in intel_iommu_init() might cause a recursive
locking issue. For example, intel_iommu_get_resv_regions() takes the
dmar_global_lock from within a section where intel_iommu_init() already
holds it via probe_acpi_namespace_devices().

The use of dmar_global_lock in intel_iommu_init() could be relaxed since
it is unlikely that any I/O board must be hot added before the IOMMU
subsystem is initialized. This eliminates the possible recursive locking
issue by moving DMAR hotplug support down until after the IOMMU is
initialized and removing the uses of dmar_global_lock in
intel_iommu_init().

Fixes: d5692d4af08cd ("iommu/vt-d: Fix suspicious RCU usage in probe_acpi_namespace_devices()")
Reported-by: Robin Murphy
Signed-off-by: Lu Baolu
Reviewed-by: Kevin Tian
Link: https://lore.kernel.org/r/894db0ccae854b35c73814485569b634237b5538.1657034828.git.robin.murphy@arm.com
Link: https://lore.kernel.org/r/20220718235325.3952426-1-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel
---
 drivers/iommu/intel/dmar.c  |  7 +++++++
 drivers/iommu/intel/iommu.c | 27 ++-------------------------
 include/linux/dmar.h        |  4 +++-
 3 files changed, 12 insertions(+), 26 deletions(-)

(limited to 'drivers/iommu/intel')

diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 5a8f780e7ffd..497c912ad9e1 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -2349,6 +2349,13 @@ static int dmar_device_hotplug(acpi_handle handle, bool insert)
 	if (!dmar_in_use())
 		return 0;
 
+	/*
+	 * It's unlikely that any I/O board is hot added before the IOMMU
+	 * subsystem is initialized.
+	 */
+	if (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled)
+		return -EOPNOTSUPP;
+
 	if (dmar_detect_dsm(handle, DMAR_DSM_FUNC_DRHD)) {
 		tmp = handle;
 	} else {
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 64d30895a4c8..1f2cd43cf9bc 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -3019,13 +3019,7 @@ static int __init init_dmars(void)
 
 #ifdef CONFIG_INTEL_IOMMU_SVM
 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
-			/*
-			 * Call dmar_alloc_hwirq() with dmar_global_lock held,
-			 * could cause possible lock race condition.
-			 */
-			up_write(&dmar_global_lock);
 			ret = intel_svm_enable_prq(iommu);
-			down_write(&dmar_global_lock);
 			if (ret)
 				goto free_iommu;
 		}
@@ -3938,7 +3932,6 @@ int __init intel_iommu_init(void)
 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
 		    platform_optin_force_iommu();
 
-	down_write(&dmar_global_lock);
 	if (dmar_table_init()) {
 		if (force_on)
 			panic("tboot: Failed to initialize DMAR table\n");
@@ -3951,16 +3944,6 @@ int __init intel_iommu_init(void)
 		goto out_free_dmar;
 	}
 
-	up_write(&dmar_global_lock);
-
-	/*
-	 * The bus notifier takes the dmar_global_lock, so lockdep will
-	 * complain later when we register it under the lock.
-	 */
-	dmar_register_bus_notifier();
-
-	down_write(&dmar_global_lock);
-
 	if (!no_iommu)
 		intel_iommu_debugfs_init();
 
@@ -4005,11 +3988,9 @@ int __init intel_iommu_init(void)
 		pr_err("Initialization failed\n");
 		goto out_free_dmar;
 	}
-	up_write(&dmar_global_lock);
 
 	init_iommu_pm_ops();
 
-	down_read(&dmar_global_lock);
 	for_each_active_iommu(iommu, drhd) {
 		/*
 		 * The flush queue implementation does not perform
@@ -4027,13 +4008,11 @@ int __init intel_iommu_init(void)
 				       "%s", iommu->name);
 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
 	}
-	up_read(&dmar_global_lock);
 
 	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
 	if (si_domain && !hw_pass_through)
 		register_memory_notifier(&intel_iommu_memory_nb);
 
-	down_read(&dmar_global_lock);
 	if (probe_acpi_namespace_devices())
 		pr_warn("ACPI name space devices didn't probe correctly\n");
 
@@ -4044,17 +4023,15 @@ int __init intel_iommu_init(void)
 		iommu_disable_protect_mem_regions(iommu);
 	}
 
-	up_read(&dmar_global_lock);
-
-	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 	intel_iommu_enabled = 1;
+	dmar_register_bus_notifier();
+	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
 
 	return 0;
 
 out_free_dmar:
 	intel_iommu_free_dmars();
-	up_write(&dmar_global_lock);
 	return ret;
 }
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index d81a51978d01..8917a32173c4 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -65,6 +65,7 @@ struct dmar_pci_notify_info {
 
 extern struct rw_semaphore dmar_global_lock;
 extern struct list_head dmar_drhd_units;
+extern int intel_iommu_enabled;
 
 #define for_each_drhd_unit(drhd) \
 	list_for_each_entry_rcu(drhd, &dmar_drhd_units, list, \
@@ -88,7 +89,8 @@ extern struct list_head dmar_drhd_units;
 static inline bool dmar_rcu_check(void)
 {
 	return rwsem_is_locked(&dmar_global_lock) ||
-	       system_state == SYSTEM_BOOTING;
+	       system_state == SYSTEM_BOOTING ||
+	       (IS_ENABLED(CONFIG_INTEL_IOMMU) && !intel_iommu_enabled);
 }
 
 #define dmar_rcu_dereference(p) rcu_dereference_check((p), dmar_rcu_check())
-- cgit