Diffstat (limited to 'drivers/iommu')
24 files changed, 856 insertions, 799 deletions
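One recurring change in the AMD hunks below is the replacement of open-coded shift/mask pairs (FEATURE_GATS_SHIFT/FEATURE_GATS_MASK, FEATURE_GLXVAL_*, FEATURE_PASID_*) with GENMASK_ULL() field definitions that are read through FIELD_GET(). The sketch that follows is a minimal userspace illustration of that pattern, not kernel code: the two helper macros only approximate the kernel's <linux/bits.h> and <linux/bitfield.h> versions, and the sample EFR value is invented for the demo. The field positions (GATS bits 13:12, GLX bits 15:14, PASMAX bits 36:32) are the ones defined in the amd_iommu_types.h hunk.

/*
 * Userspace approximation of the GENMASK_ULL()/FIELD_GET() pattern.
 * The helpers below are simplified stand-ins for the kernel macros.
 */
#include <stdint.h>
#include <stdio.h>

#define GENMASK_ULL(h, l) \
	(((~0ULL) << (l)) & (~0ULL >> (63 - (h))))
/* Simplified FIELD_GET(): mask the register, then shift the field down
 * by dividing by the mask's lowest set bit. */
#define FIELD_GET(mask, reg) \
	(((reg) & (mask)) / ((mask) & -(mask)))

#define FEATURE_GATS	GENMASK_ULL(13, 12)
#define FEATURE_GLX	GENMASK_ULL(15, 14)
#define FEATURE_PASMAX	GENMASK_ULL(36, 32)

int main(void)
{
	/* Arbitrary example EFR value, not taken from real hardware. */
	uint64_t efr = (1ULL << 12) | (2ULL << 14) | (15ULL << 32);

	printf("GATS   = %llu\n", (unsigned long long)FIELD_GET(FEATURE_GATS, efr));
	printf("GLX    = %llu\n", (unsigned long long)FIELD_GET(FEATURE_GLX, efr));
	printf("PASMAX = %llu\n", (unsigned long long)FIELD_GET(FEATURE_PASMAX, efr));
	return 0;
}

Separately, the init.c hunk below adds two amd_iommu= boot options that shrink the v1 page-size bitmap: amd_iommu=nohugepages restricts mappings to 4 KiB pages, and amd_iommu=v2_pgsizes_only restricts them to the 4 KiB/2 MiB/1 GiB set also used by the v2 page table.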
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h index 2d5945c982bd..6386fa4556d9 100644 --- a/drivers/iommu/amd/amd_iommu.h +++ b/drivers/iommu/amd/amd_iommu.h @@ -43,9 +43,10 @@ int amd_iommu_enable_faulting(unsigned int cpu); extern int amd_iommu_guest_ir; extern enum io_pgtable_fmt amd_iommu_pgtable; extern int amd_iommu_gpt_level; +extern unsigned long amd_iommu_pgsize_bitmap; /* Protection domain ops */ -struct protection_domain *protection_domain_alloc(unsigned int type); +struct protection_domain *protection_domain_alloc(unsigned int type, int nid); void protection_domain_free(struct protection_domain *domain); struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct mm_struct *mm); @@ -87,14 +88,10 @@ int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag); void amd_iommu_flush_all_caches(struct amd_iommu *iommu); void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); void amd_iommu_domain_update(struct protection_domain *domain); -void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set); -void amd_iommu_domain_flush_complete(struct protection_domain *domain); void amd_iommu_domain_flush_pages(struct protection_domain *domain, u64 address, size_t size); void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, ioasid_t pasid, u64 address, size_t size); -void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, - ioasid_t pasid); #ifdef CONFIG_IRQ_REMAP int amd_iommu_create_irq_domain(struct amd_iommu *iommu); @@ -121,11 +118,6 @@ static inline bool check_feature2(u64 mask) return (amd_iommu_efr2 & mask); } -static inline int check_feature_gpt_level(void) -{ - return ((amd_iommu_efr >> FEATURE_GATS_SHIFT) & FEATURE_GATS_MASK); -} - static inline bool amd_iommu_gt_ppr_supported(void) { return (check_feature(FEATURE_GT) && @@ -143,19 +135,6 @@ static inline void *iommu_phys_to_virt(unsigned long paddr) return phys_to_virt(__sme_clr(paddr)); } -static inline -void amd_iommu_domain_set_pt_root(struct protection_domain *domain, u64 root) -{ - domain->iop.root = (u64 *)(root & PAGE_MASK); - domain->iop.mode = root & 7; /* lowest 3 bits encode pgtable mode */ -} - -static inline -void amd_iommu_domain_clr_pt_root(struct protection_domain *domain) -{ - amd_iommu_domain_set_pt_root(domain, 0); -} - static inline int get_pci_sbdf_id(struct pci_dev *pdev) { int seg = pci_domain_nr(pdev->bus); @@ -185,7 +164,6 @@ static inline struct protection_domain *to_pdomain(struct iommu_domain *dom) } bool translation_pre_enabled(struct amd_iommu *iommu); -bool amd_iommu_is_attach_deferred(struct device *dev); int __init add_special_device(u8 type, u8 id, u32 *devid, bool cmd_line); #ifdef CONFIG_DMI diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 2b76b5dedc1d..601fb4ee6900 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -8,6 +8,7 @@ #ifndef _ASM_X86_AMD_IOMMU_TYPES_H #define _ASM_X86_AMD_IOMMU_TYPES_H +#include <linux/bitfield.h> #include <linux/iommu.h> #include <linux/types.h> #include <linux/mmu_notifier.h> @@ -95,26 +96,21 @@ #define FEATURE_GA BIT_ULL(7) #define FEATURE_HE BIT_ULL(8) #define FEATURE_PC BIT_ULL(9) -#define FEATURE_GATS_SHIFT (12) -#define FEATURE_GATS_MASK (3ULL) +#define FEATURE_GATS GENMASK_ULL(13, 12) +#define FEATURE_GLX GENMASK_ULL(15, 14) #define FEATURE_GAM_VAPIC BIT_ULL(21) +#define FEATURE_PASMAX GENMASK_ULL(36, 32) #define FEATURE_GIOSUP BIT_ULL(48) 
#define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) #define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) -#define FEATURE_PASID_SHIFT 32 -#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) - -#define FEATURE_GLXVAL_SHIFT 14 -#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT) /* Extended Feature 2 Bits */ -#define FEATURE_SNPAVICSUP_SHIFT 5 -#define FEATURE_SNPAVICSUP_MASK (0x07ULL << FEATURE_SNPAVICSUP_SHIFT) +#define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5) #define FEATURE_SNPAVICSUP_GAM(x) \ - ((x & FEATURE_SNPAVICSUP_MASK) >> FEATURE_SNPAVICSUP_SHIFT == 0x1) + (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) /* Note: * The current driver only support 16-bit PASID. @@ -294,8 +290,13 @@ * that we support. * * 512GB Pages are not supported due to a hardware bug + * Page sizes >= the 52 bit max physical address of the CPU are not supported. */ -#define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) +#define AMD_IOMMU_PGSIZES (GENMASK_ULL(51, 12) ^ SZ_512G) + +/* Special mode where page-sizes are limited to 4 KiB */ +#define AMD_IOMMU_PGSIZES_4K (PAGE_SIZE) + /* 4K, 2MB, 1G page sizes are supported */ #define AMD_IOMMU_PGSIZES_V2 (PAGE_SIZE | (1ULL << 21) | (1ULL << 30)) @@ -419,10 +420,6 @@ #define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL) #define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0x1fffffULL) -#define DTE_GCR3_INDEX_A 0 -#define DTE_GCR3_INDEX_B 1 -#define DTE_GCR3_INDEX_C 1 - #define DTE_GCR3_SHIFT_A 58 #define DTE_GCR3_SHIFT_B 16 #define DTE_GCR3_SHIFT_C 43 @@ -527,7 +524,7 @@ struct amd_irte_ops; #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0) #define io_pgtable_to_data(x) \ - container_of((x), struct amd_io_pgtable, iop) + container_of((x), struct amd_io_pgtable, pgtbl) #define io_pgtable_ops_to_data(x) \ io_pgtable_to_data(io_pgtable_ops_to_pgtable(x)) @@ -537,7 +534,7 @@ struct amd_irte_ops; struct protection_domain, iop) #define io_pgtable_cfg_to_data(x) \ - container_of((x), struct amd_io_pgtable, pgtbl_cfg) + container_of((x), struct amd_io_pgtable, pgtbl.cfg) struct gcr3_tbl_info { u64 *gcr3_tbl; /* Guest CR3 table */ @@ -547,8 +544,7 @@ struct gcr3_tbl_info { }; struct amd_io_pgtable { - struct io_pgtable_cfg pgtbl_cfg; - struct io_pgtable iop; + struct io_pgtable pgtbl; int mode; u64 *root; u64 *pgd; /* v2 pgtable pgd pointer */ @@ -580,7 +576,6 @@ struct protection_domain { struct amd_io_pgtable iop; spinlock_t lock; /* mostly used to lock the page table*/ u16 id; /* the domain id written to the device table */ - int nid; /* Node ID */ enum protection_domain_mode pd_mode; /* Track page table type */ bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index c89d85b54a1a..43131c3a2172 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -192,6 +192,8 @@ bool amdr_ivrs_remap_support __read_mostly; bool amd_iommu_force_isolation __read_mostly; +unsigned long amd_iommu_pgsize_bitmap __ro_after_init = AMD_IOMMU_PGSIZES; + /* * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap * to know which ones are already in use. 
@@ -2042,14 +2044,12 @@ static int __init iommu_init_pci(struct amd_iommu *iommu) int glxval; u64 pasmax; - pasmax = amd_iommu_efr & FEATURE_PASID_MASK; - pasmax >>= FEATURE_PASID_SHIFT; + pasmax = FIELD_GET(FEATURE_PASMAX, amd_iommu_efr); iommu->iommu.max_pasids = (1 << (pasmax + 1)) - 1; BUG_ON(iommu->iommu.max_pasids & ~PASID_MASK); - glxval = amd_iommu_efr & FEATURE_GLXVAL_MASK; - glxval >>= FEATURE_GLXVAL_SHIFT; + glxval = FIELD_GET(FEATURE_GLX, amd_iommu_efr); if (amd_iommu_max_glx_val == -1) amd_iommu_max_glx_val = glxval; @@ -3088,7 +3088,7 @@ static int __init early_amd_iommu_init(void) /* 5 level guest page table */ if (cpu_feature_enabled(X86_FEATURE_LA57) && - check_feature_gpt_level() == GUEST_PGTABLE_5_LEVEL) + FIELD_GET(FEATURE_GATS, amd_iommu_efr) == GUEST_PGTABLE_5_LEVEL) amd_iommu_gpt_level = PAGE_MODE_5_LEVEL; /* Disable any previously enabled IOMMUs */ @@ -3494,6 +3494,12 @@ static int __init parse_amd_iommu_options(char *str) amd_iommu_pgtable = AMD_IOMMU_V2; } else if (strncmp(str, "irtcachedis", 11) == 0) { amd_iommu_irtcachedis = true; + } else if (strncmp(str, "nohugepages", 11) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_4K; + } else if (strncmp(str, "v2_pgsizes_only", 15) == 0) { + pr_info("Restricting V1 page-sizes to 4KiB/2MiB/1GiB"); + amd_iommu_pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; } else { pr_notice("Unknown option - '%s'\n", str); } diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 1074ee25064d..804b788f3f16 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -24,27 +24,6 @@ #include "amd_iommu.h" #include "../iommu-pages.h" -static void v1_tlb_flush_all(void *cookie) -{ -} - -static void v1_tlb_flush_walk(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ -} - -static void v1_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ -} - -static const struct iommu_flush_ops v1_flush_ops = { - .tlb_flush_all = v1_tlb_flush_all, - .tlb_flush_walk = v1_tlb_flush_walk, - .tlb_add_page = v1_tlb_add_page, -}; - /* * Helper function to get the first pte of a large mapping */ @@ -132,56 +111,40 @@ static void free_sub_pt(u64 *root, int mode, struct list_head *freelist) } } -void amd_iommu_domain_set_pgtable(struct protection_domain *domain, - u64 *root, int mode) -{ - u64 pt_root; - - /* lowest 3 bits encode pgtable mode */ - pt_root = mode & 7; - pt_root |= (u64)root; - - amd_iommu_domain_set_pt_root(domain, pt_root); -} - /* * This function is used to add another level to an IO page table. Adding * another level increases the size of the address space by 9 bits to a size up * to 64 bits. 
*/ -static bool increase_address_space(struct protection_domain *domain, +static bool increase_address_space(struct amd_io_pgtable *pgtable, unsigned long address, gfp_t gfp) { + struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; + struct protection_domain *domain = + container_of(pgtable, struct protection_domain, iop); unsigned long flags; bool ret = true; u64 *pte; - pte = iommu_alloc_page_node(domain->nid, gfp); + pte = iommu_alloc_page_node(cfg->amd.nid, gfp); if (!pte) return false; spin_lock_irqsave(&domain->lock, flags); - if (address <= PM_LEVEL_SIZE(domain->iop.mode)) + if (address <= PM_LEVEL_SIZE(pgtable->mode)) goto out; ret = false; - if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL)) + if (WARN_ON_ONCE(pgtable->mode == PAGE_MODE_6_LEVEL)) goto out; - *pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root)); + *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); - domain->iop.root = pte; - domain->iop.mode += 1; + pgtable->root = pte; + pgtable->mode += 1; amd_iommu_update_and_flush_device_table(domain); - amd_iommu_domain_flush_complete(domain); - - /* - * Device Table needs to be updated and flushed before the new root can - * be published. - */ - amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode); pte = NULL; ret = true; @@ -193,30 +156,31 @@ out: return ret; } -static u64 *alloc_pte(struct protection_domain *domain, +static u64 *alloc_pte(struct amd_io_pgtable *pgtable, unsigned long address, unsigned long page_size, u64 **pte_page, gfp_t gfp, bool *updated) { + struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; int level, end_lvl; u64 *pte, *page; BUG_ON(!is_power_of_2(page_size)); - while (address > PM_LEVEL_SIZE(domain->iop.mode)) { + while (address > PM_LEVEL_SIZE(pgtable->mode)) { /* * Return an error if there is no memory to update the * page-table. 
*/ - if (!increase_address_space(domain, address, gfp)) + if (!increase_address_space(pgtable, address, gfp)) return NULL; } - level = domain->iop.mode - 1; - pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)]; + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; address = PAGE_SIZE_ALIGN(address, page_size); end_lvl = PAGE_SIZE_LEVEL(page_size); @@ -251,7 +215,7 @@ static u64 *alloc_pte(struct protection_domain *domain, if (!IOMMU_PTE_PRESENT(__pte) || pte_level == PAGE_MODE_NONE) { - page = iommu_alloc_page_node(domain->nid, gfp); + page = iommu_alloc_page_node(cfg->amd.nid, gfp); if (!page) return NULL; @@ -365,7 +329,7 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { - struct protection_domain *dom = io_pgtable_ops_to_domain(ops); + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); LIST_HEAD(freelist); bool updated = false; u64 __pte, *pte; @@ -382,7 +346,7 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, while (pgcount > 0) { count = PAGE_SIZE_PTE_COUNT(pgsize); - pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated); + pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated); ret = -ENOMEM; if (!pte) @@ -419,6 +383,7 @@ static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova, out: if (updated) { + struct protection_domain *dom = io_pgtable_ops_to_domain(ops); unsigned long flags; spin_lock_irqsave(&dom->lock, flags); @@ -560,27 +525,17 @@ static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, */ static void v1_free_pgtable(struct io_pgtable *iop) { - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); - struct protection_domain *dom; + struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); LIST_HEAD(freelist); if (pgtable->mode == PAGE_MODE_NONE) return; - dom = container_of(pgtable, struct protection_domain, iop); - /* Page-table is not visible to IOMMU anymore, so free it */ BUG_ON(pgtable->mode < PAGE_MODE_NONE || pgtable->mode > PAGE_MODE_6_LEVEL); free_sub_pt(pgtable->root, pgtable->mode, &freelist); - - /* Update data structure */ - amd_iommu_domain_clr_pt_root(dom); - - /* Make changes visible to IOMMUs */ - amd_iommu_domain_update(dom); - iommu_put_pages_list(&freelist); } @@ -588,17 +543,21 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo { struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES; + pgtable->root = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL); + if (!pgtable->root) + return NULL; + pgtable->mode = PAGE_MODE_3_LEVEL; + + cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - cfg->tlb = &v1_flush_ops; - pgtable->iop.ops.map_pages = iommu_v1_map_pages; - pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; - pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; - pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; + pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; + pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; + pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; + pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; - return &pgtable->iop; + return &pgtable->pgtbl; } struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { diff --git 
a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 664e91c88748..25b9042fa453 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -51,7 +51,7 @@ static inline u64 set_pgtable_attr(u64 *page) u64 prot; prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; - prot |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; + prot |= IOMMU_PAGE_ACCESS; return (iommu_virt_to_phys(page) | prot); } @@ -233,8 +233,8 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) { - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); - struct io_pgtable_cfg *cfg = &pdom->iop.iop.cfg; + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; u64 *pte; unsigned long map_size; unsigned long mapped_size = 0; @@ -251,7 +251,7 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, while (mapped_size < size) { map_size = get_alloc_page_size(pgsize); - pte = v2_alloc_pte(pdom->nid, pdom->iop.pgd, + pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, iova, map_size, gfp, &updated); if (!pte) { ret = -EINVAL; @@ -266,8 +266,11 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, } out: - if (updated) + if (updated) { + struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); + amd_iommu_domain_flush_pages(pdom, o_iova, size); + } if (mapped) *mapped += mapped_size; @@ -281,7 +284,7 @@ static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, struct iommu_iotlb_gather *gather) { struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); - struct io_pgtable_cfg *cfg = &pgtable->iop.cfg; + struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; unsigned long unmap_size; unsigned long unmapped = 0; size_t size = pgcount << __ffs(pgsize); @@ -323,30 +326,9 @@ static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo /* * ---------------------------------------------------- */ -static void v2_tlb_flush_all(void *cookie) -{ -} - -static void v2_tlb_flush_walk(unsigned long iova, size_t size, - size_t granule, void *cookie) -{ -} - -static void v2_tlb_add_page(struct iommu_iotlb_gather *gather, - unsigned long iova, size_t granule, - void *cookie) -{ -} - -static const struct iommu_flush_ops v2_flush_ops = { - .tlb_flush_all = v2_tlb_flush_all, - .tlb_flush_walk = v2_tlb_flush_walk, - .tlb_add_page = v2_tlb_add_page, -}; - static void v2_free_pgtable(struct io_pgtable *iop) { - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop); + struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); if (!pgtable || !pgtable->pgd) return; @@ -359,26 +341,24 @@ static void v2_free_pgtable(struct io_pgtable *iop) static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) { struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); - struct protection_domain *pdom = (struct protection_domain *)cookie; int ias = IOMMU_IN_ADDR_BIT_SIZE; - pgtable->pgd = iommu_alloc_page_node(pdom->nid, GFP_ATOMIC); + pgtable->pgd = iommu_alloc_page_node(cfg->amd.nid, GFP_KERNEL); if (!pgtable->pgd) return NULL; if (get_pgtable_level() == PAGE_MODE_5_LEVEL) ias = 57; - pgtable->iop.ops.map_pages = iommu_v2_map_pages; - pgtable->iop.ops.unmap_pages = iommu_v2_unmap_pages; - pgtable->iop.ops.iova_to_phys = iommu_v2_iova_to_phys; + pgtable->pgtbl.ops.map_pages = 
iommu_v2_map_pages; + pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; + pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2, - cfg->ias = ias, - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE, - cfg->tlb = &v2_flush_ops; + cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; + cfg->ias = ias; + cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; - return &pgtable->iop; + return &pgtable->pgtbl; } struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index b19e8c0f48fa..8364cd6fa47d 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -52,8 +52,6 @@ #define HT_RANGE_START (0xfd00000000ULL) #define HT_RANGE_END (0xffffffffffULL) -#define DEFAULT_PGTABLE_LEVEL PAGE_MODE_3_LEVEL - static DEFINE_SPINLOCK(pd_bitmap_lock); LIST_HEAD(ioapic_map); @@ -825,10 +823,12 @@ static void iommu_poll_events(struct amd_iommu *iommu) while (head != tail) { iommu_print_event(iommu, iommu->evt_buf + head); + + /* Update head pointer of hardware ring-buffer */ head = (head + EVENT_ENTRY_SIZE) % EVT_BUFFER_SIZE; + writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } - writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); } #ifdef CONFIG_IRQ_REMAP @@ -1247,6 +1247,22 @@ out_unlock: return ret; } +static void domain_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { + if (domain && !domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } +} + static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) { struct iommu_cmd cmd; @@ -1483,7 +1499,7 @@ void amd_iommu_domain_flush_pages(struct protection_domain *domain, __domain_flush_pages(domain, address, size); /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ - amd_iommu_domain_flush_complete(domain); + domain_flush_complete(domain); return; } @@ -1523,7 +1539,7 @@ void amd_iommu_domain_flush_pages(struct protection_domain *domain, } /* Wait until IOMMU TLB and all device IOTLB flushes are complete */ - amd_iommu_domain_flush_complete(domain); + domain_flush_complete(domain); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ @@ -1549,27 +1565,11 @@ void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data, iommu_completion_wait(iommu); } -void amd_iommu_dev_flush_pasid_all(struct iommu_dev_data *dev_data, - ioasid_t pasid) +static void dev_flush_pasid_all(struct iommu_dev_data *dev_data, + ioasid_t pasid) { - amd_iommu_dev_flush_pasid_pages(dev_data, 0, - CMD_INV_IOMMU_ALL_PAGES_ADDRESS, pasid); -} - -void amd_iommu_domain_flush_complete(struct protection_domain *domain) -{ - int i; - - for (i = 0; i < amd_iommu_get_num_iommus(); ++i) { - if (domain && !domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands. 
- */ - iommu_completion_wait(amd_iommus[i]); - } + amd_iommu_dev_flush_pasid_pages(dev_data, pasid, 0, + CMD_INV_IOMMU_ALL_PAGES_ADDRESS); } /* Flush the not present cache if it exists */ @@ -1589,15 +1589,7 @@ static void domain_flush_np_cache(struct protection_domain *domain, /* * This function flushes the DTEs for all devices in domain */ -static void domain_flush_devices(struct protection_domain *domain) -{ - struct iommu_dev_data *dev_data; - - list_for_each_entry(dev_data, &domain->dev_list, list) - device_flush_dte(dev_data); -} - -static void update_device_table(struct protection_domain *domain) +void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) { struct iommu_dev_data *dev_data; @@ -1607,12 +1599,11 @@ static void update_device_table(struct protection_domain *domain) set_dte_entry(iommu, dev_data); clone_aliases(iommu, dev_data->dev); } -} -void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) -{ - update_device_table(domain); - domain_flush_devices(domain); + list_for_each_entry(dev_data, &domain->dev_list, list) + device_flush_dte(dev_data); + + domain_flush_complete(domain); } void amd_iommu_domain_update(struct protection_domain *domain) @@ -1816,7 +1807,7 @@ static int update_gcr3(struct iommu_dev_data *dev_data, else *pte = 0; - amd_iommu_dev_flush_pasid_all(dev_data, pasid); + dev_flush_pasid_all(dev_data, pasid); return 0; } @@ -1962,7 +1953,7 @@ static void clear_dte_entry(struct amd_iommu *iommu, u16 devid) } /* Update and flush DTE for the given device */ -void amd_iommu_dev_update_dte(struct iommu_dev_data *dev_data, bool set) +static void dev_update_dte(struct iommu_dev_data *dev_data, bool set) { struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); @@ -2032,6 +2023,7 @@ static int do_attach(struct iommu_dev_data *dev_data, struct protection_domain *domain) { struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); + struct io_pgtable_cfg *cfg = &domain->iop.pgtbl.cfg; int ret = 0; /* Update data structures */ @@ -2039,8 +2031,8 @@ static int do_attach(struct iommu_dev_data *dev_data, list_add(&dev_data->list, &domain->dev_list); /* Update NUMA Node ID */ - if (domain->nid == NUMA_NO_NODE) - domain->nid = dev_to_node(dev_data->dev); + if (cfg->amd.nid == NUMA_NO_NODE) + cfg->amd.nid = dev_to_node(dev_data->dev); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; @@ -2062,7 +2054,7 @@ static void do_detach(struct iommu_dev_data *dev_data) struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); /* Clear DTE and flush the entry */ - amd_iommu_dev_update_dte(dev_data, false); + dev_update_dte(dev_data, false); /* Flush IOTLB and wait for the flushes to finish */ amd_iommu_domain_flush_all(domain); @@ -2185,11 +2177,12 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) dev_err(dev, "Failed to initialize - trying to proceed anyway\n"); iommu_dev = ERR_PTR(ret); iommu_ignore_device(iommu, dev); - } else { - amd_iommu_set_pci_msi_domain(dev, iommu); - iommu_dev = &iommu->iommu; + goto out_err; } + amd_iommu_set_pci_msi_domain(dev, iommu); + iommu_dev = &iommu->iommu; + /* * If IOMMU and device supports PASID then it will contain max * supported PASIDs, else it will be zero. 
@@ -2201,8 +2194,12 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev) pci_max_pasids(to_pci_dev(dev))); } +out_err: iommu_completion_wait(iommu); + if (dev_is_pci(dev)) + pci_prepare_ats(to_pci_dev(dev), PAGE_SHIFT); + return iommu_dev; } @@ -2259,53 +2256,18 @@ static void cleanup_domain(struct protection_domain *domain) void protection_domain_free(struct protection_domain *domain) { - if (!domain) - return; - - if (domain->iop.pgtbl_cfg.tlb) - free_io_pgtable_ops(&domain->iop.iop.ops); - - if (domain->iop.root) - iommu_free_page(domain->iop.root); - - if (domain->id) - domain_id_free(domain->id); - + WARN_ON(!list_empty(&domain->dev_list)); + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) + free_io_pgtable_ops(&domain->iop.pgtbl.ops); + domain_id_free(domain->id); kfree(domain); } -static int protection_domain_init_v1(struct protection_domain *domain, int mode) -{ - u64 *pt_root = NULL; - - BUG_ON(mode < PAGE_MODE_NONE || mode > PAGE_MODE_6_LEVEL); - - if (mode != PAGE_MODE_NONE) { - pt_root = iommu_alloc_page(GFP_KERNEL); - if (!pt_root) - return -ENOMEM; - } - - domain->pd_mode = PD_MODE_V1; - amd_iommu_domain_set_pgtable(domain, pt_root, mode); - - return 0; -} - -static int protection_domain_init_v2(struct protection_domain *pdom) -{ - pdom->pd_mode = PD_MODE_V2; - pdom->domain.pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; - - return 0; -} - -struct protection_domain *protection_domain_alloc(unsigned int type) +struct protection_domain *protection_domain_alloc(unsigned int type, int nid) { struct io_pgtable_ops *pgtbl_ops; struct protection_domain *domain; int pgtable; - int ret; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) @@ -2313,12 +2275,12 @@ struct protection_domain *protection_domain_alloc(unsigned int type) domain->id = domain_id_alloc(); if (!domain->id) - goto out_err; + goto err_free; spin_lock_init(&domain->lock); INIT_LIST_HEAD(&domain->dev_list); INIT_LIST_HEAD(&domain->dev_data_list); - domain->nid = NUMA_NO_NODE; + domain->iop.pgtbl.cfg.amd.nid = nid; switch (type) { /* No need to allocate io pgtable ops in passthrough mode */ @@ -2336,31 +2298,30 @@ struct protection_domain *protection_domain_alloc(unsigned int type) pgtable = AMD_IOMMU_V1; break; default: - goto out_err; + goto err_id; } switch (pgtable) { case AMD_IOMMU_V1: - ret = protection_domain_init_v1(domain, DEFAULT_PGTABLE_LEVEL); + domain->pd_mode = PD_MODE_V1; break; case AMD_IOMMU_V2: - ret = protection_domain_init_v2(domain); + domain->pd_mode = PD_MODE_V2; break; default: - ret = -EINVAL; - break; + goto err_id; } - if (ret) - goto out_err; - - pgtbl_ops = alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl_cfg, domain); + pgtbl_ops = + alloc_io_pgtable_ops(pgtable, &domain->iop.pgtbl.cfg, domain); if (!pgtbl_ops) - goto out_err; + goto err_id; return domain; -out_err: - protection_domain_free(domain); +err_id: + domain_id_free(domain->id); +err_free: + kfree(domain); return NULL; } @@ -2398,17 +2359,18 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, if (dirty_tracking && !amd_iommu_hd_support(iommu)) return ERR_PTR(-EOPNOTSUPP); - domain = protection_domain_alloc(type); + domain = protection_domain_alloc(type, + dev ? 
dev_to_node(dev) : NUMA_NO_NODE); if (!domain) return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; if (iommu) { domain->domain.type = type; - domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; domain->domain.ops = iommu->iommu.ops->default_domain_ops; if (dirty_tracking) @@ -2448,9 +2410,6 @@ void amd_iommu_domain_free(struct iommu_domain *dom) struct protection_domain *domain; unsigned long flags; - if (!dom) - return; - domain = to_pdomain(dom); spin_lock_irqsave(&domain->lock, flags); @@ -2462,6 +2421,29 @@ void amd_iommu_domain_free(struct iommu_domain *dom) protection_domain_free(domain); } +static int blocked_domain_attach_device(struct iommu_domain *domain, + struct device *dev) +{ + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + + if (dev_data->domain) + detach_device(dev); + + /* Clear DTE and flush the entry */ + spin_lock(&dev_data->lock); + dev_update_dte(dev_data, false); + spin_unlock(&dev_data->lock); + + return 0; +} + +static struct iommu_domain blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = blocked_domain_attach_device, + } +}; + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { @@ -2517,7 +2499,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } /* Update device table */ - amd_iommu_dev_update_dte(dev_data, true); + dev_update_dte(dev_data, true); return ret; } @@ -2526,7 +2508,7 @@ static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, unsigned long iova, size_t size) { struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; + struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; if (ops->map_pages) domain_flush_np_cache(domain, iova, size); @@ -2538,7 +2520,7 @@ static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, int iommu_prot, gfp_t gfp, size_t *mapped) { struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; + struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; int prot = 0; int ret = -EINVAL; @@ -2585,7 +2567,7 @@ static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova struct iommu_iotlb_gather *gather) { struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; + struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; size_t r; if ((domain->pd_mode == PD_MODE_V1) && @@ -2604,7 +2586,7 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, dma_addr_t iova) { struct protection_domain *domain = to_pdomain(dom); - struct io_pgtable_ops *ops = &domain->iop.iop.ops; + struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; return ops->iova_to_phys(ops, iova); } @@ -2682,7 +2664,7 @@ static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, struct iommu_dirty_bitmap *dirty) { struct protection_domain *pdomain = to_pdomain(domain); - struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; unsigned long lflags; if (!ops || !ops->read_and_clear_dirty) @@ -2757,7 +2739,7 @@ static void amd_iommu_get_resv_regions(struct device *dev, list_add_tail(®ion->list, head); } -bool amd_iommu_is_attach_deferred(struct device *dev) +static bool amd_iommu_is_attach_deferred(struct device 
*dev) { struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); @@ -2859,6 +2841,7 @@ static int amd_iommu_dev_disable_feature(struct device *dev, const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, + .blocked_domain = &blocked_domain, .domain_alloc = amd_iommu_domain_alloc, .domain_alloc_user = amd_iommu_domain_alloc_user, .domain_alloc_sva = amd_iommu_domain_alloc_sva, @@ -2867,7 +2850,6 @@ const struct iommu_ops amd_iommu_ops = { .device_group = amd_iommu_device_group, .get_resv_regions = amd_iommu_get_resv_regions, .is_attach_deferred = amd_iommu_is_attach_deferred, - .pgsize_bitmap = AMD_IOMMU_PGSIZES, .def_domain_type = amd_iommu_def_domain_type, .dev_enable_feat = amd_iommu_dev_enable_feature, .dev_disable_feat = amd_iommu_dev_disable_feature, diff --git a/drivers/iommu/amd/pasid.c b/drivers/iommu/amd/pasid.c index a68215f2b3e1..0657b9373be5 100644 --- a/drivers/iommu/amd/pasid.c +++ b/drivers/iommu/amd/pasid.c @@ -181,7 +181,7 @@ struct iommu_domain *amd_iommu_domain_alloc_sva(struct device *dev, struct protection_domain *pdom; int ret; - pdom = protection_domain_alloc(IOMMU_DOMAIN_SVA); + pdom = protection_domain_alloc(IOMMU_DOMAIN_SVA, dev_to_node(dev)); if (!pdom) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index edc625ec261d..737c5b882355 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1811,7 +1811,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) goto out_unlock; } - iommu_report_device_fault(master->dev, &fault_evt); + ret = iommu_report_device_fault(master->dev, &fault_evt); out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; @@ -3307,6 +3307,12 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) smmu->features & ARM_SMMU_FEAT_STALL_FORCE) master->stall_enabled = true; + if (dev_is_pci(dev)) { + unsigned int stu = __ffs(smmu->pgsize_bitmap); + + pci_prepare_ats(to_pci_dev(dev), stu); + } + return &smmu->iommu; err_free_master: diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index e9d2bff4659b..30be786bff11 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -416,14 +416,12 @@ static struct iommu_group *fsl_pamu_device_group(struct device *dev) static struct iommu_device *fsl_pamu_probe_device(struct device *dev) { - int len; - /* * uboot must fill the fsl,liodn for platform devices to be supported by * the iommu. */ if (!dev_is_pci(dev) && - !of_get_property(dev->of_node, "fsl,liodn", &len)) + !of_property_present(dev->of_node, "fsl,liodn")) return ERR_PTR(-ENODEV); return &pamu_iommu; diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c index 44e92638c0cd..e5b89f728ad3 100644 --- a/drivers/iommu/intel/cache.c +++ b/drivers/iommu/intel/cache.c @@ -190,6 +190,13 @@ int cache_tag_assign_domain(struct dmar_domain *domain, u16 did = domain_get_id_for_dev(domain, dev); int ret; + /* domain->qi_bach will be freed in iommu_free_domain() path. 
*/ + if (!domain->qi_batch) { + domain->qi_batch = kzalloc(sizeof(*domain->qi_batch), GFP_KERNEL); + if (!domain->qi_batch) + return -ENOMEM; + } + ret = __cache_tag_assign_domain(domain, did, dev, pasid); if (ret || domain->domain.type != IOMMU_DOMAIN_NESTED) return ret; @@ -255,6 +262,154 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); } +static void qi_batch_flush_descs(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (!iommu || !batch->index) + return; + + qi_submit_sync(iommu, batch->descs, batch->index, 0); + + /* Reset the index value and clean the whole batch buffer. */ + memset(batch, 0, sizeof(*batch)); +} + +static void qi_batch_increment_index(struct intel_iommu *iommu, struct qi_batch *batch) +{ + if (++batch->index == QI_MAX_BATCHED_DESC_COUNT) + qi_batch_flush_descs(iommu, batch); +} + +static void qi_batch_add_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_batch *batch) +{ + qi_desc_iotlb(iommu, did, addr, size_order, type, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u16 qdep, u64 addr, unsigned int mask, + struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any Device-TLB + * invalidation requests while address remapping hardware is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, + u64 addr, unsigned long npages, bool ih, + struct qi_batch *batch) +{ + /* + * npages == -1 means a PASID-selective invalidation, otherwise, + * a positive value for Page-selective-within-PASID invalidation. + * 0 is not a valid input. + */ + if (!npages) + return; + + qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void qi_batch_add_pasid_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, + u32 pasid, u16 qdep, u64 addr, + unsigned int size_order, struct qi_batch *batch) +{ + /* + * According to VT-d spec, software is recommended to not submit any + * Device-TLB invalidation requests while address remapping hardware + * is disabled. + */ + if (!(iommu->gcmd & DMA_GCMD_TE)) + return; + + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, qdep, addr, size_order, + &batch->descs[batch->index]); + qi_batch_increment_index(iommu, batch); +} + +static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long pages, + unsigned long mask, int ih) +{ + struct intel_iommu *iommu = tag->iommu; + u64 type = DMA_TLB_PSI_FLUSH; + + if (domain->use_first_level) { + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, + pages, ih, domain->qi_batch); + return; + } + + /* + * Fallback to domain selective flush if no PSI support or the size + * is too big. 
+ */ + if (!cap_pgsel_inv(iommu->cap) || + mask > cap_max_amask_val(iommu->cap) || pages == -1) { + addr = 0; + mask = 0; + ih = 0; + type = DMA_TLB_DSI_FLUSH; + } + + if (ecap_qis(iommu->ecap)) + qi_batch_add_iotlb(iommu, tag->domain_id, addr | ih, mask, type, + domain->qi_batch); + else + __iommu_flush_iotlb(iommu, tag->domain_id, addr | ih, mask, type); +} + +static void cache_tag_flush_devtlb_psi(struct dmar_domain *domain, struct cache_tag *tag, + unsigned long addr, unsigned long mask) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + if (tag->pasid == IOMMU_NO_PASID) { + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, + addr, mask, domain->qi_batch); + return; + } + + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_pasid_dev_iotlb(iommu, sid, info->pfsid, tag->pasid, + info->ats_qdep, addr, mask, + domain->qi_batch); +} + +static void cache_tag_flush_devtlb_all(struct dmar_domain *domain, struct cache_tag *tag) +{ + struct intel_iommu *iommu = tag->iommu; + struct device_domain_info *info; + u16 sid; + + info = dev_iommu_priv_get(tag->dev); + sid = PCI_DEVID(info->bus, info->devfn); + + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, domain->qi_batch); + if (info->dtlb_extra_inval) + qi_batch_add_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, 0, + MAX_AGAW_PFN_WIDTH, domain->qi_batch); +} + /* * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive) * when the memory mappings in the target domain have been modified. @@ -262,6 +417,7 @@ static unsigned long calculate_psi_aligned_address(unsigned long start, void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, unsigned long end, int ih) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -270,30 +426,14 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) { - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, addr, pages, ih); - } else { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. 
- */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr | ih, mask, - DMA_TLB_PSI_FLUSH); - } + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); break; case CACHE_TAG_NESTING_DEVTLB: /* @@ -307,23 +447,13 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, mask = MAX_AGAW_PFN_WIDTH; fallthrough; case CACHE_TAG_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - if (tag->pasid == IOMMU_NO_PASID) - qi_flush_dev_iotlb(iommu, sid, info->pfsid, - info->ats_qdep, addr, mask); - else - qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid, - tag->pasid, info->ats_qdep, - addr, mask); - - quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep); + cache_tag_flush_devtlb_psi(domain, tag, addr, mask); break; } trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -333,39 +463,30 @@ void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start, */ void cache_tag_flush_all(struct dmar_domain *domain) { + struct intel_iommu *iommu = NULL; struct cache_tag *tag; unsigned long flags; spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; - struct device_domain_info *info; - u16 sid; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; switch (tag->type) { case CACHE_TAG_IOTLB: case CACHE_TAG_NESTING_IOTLB: - if (domain->use_first_level) - qi_flush_piotlb(iommu, tag->domain_id, - tag->pasid, 0, -1, 0); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); + cache_tag_flush_iotlb(domain, tag, 0, -1, 0, 0); break; case CACHE_TAG_DEVTLB: case CACHE_TAG_NESTING_DEVTLB: - info = dev_iommu_priv_get(tag->dev); - sid = PCI_DEVID(info->bus, info->devfn); - - qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep, - 0, MAX_AGAW_PFN_WIDTH); - quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH, - IOMMU_NO_PASID, info->ats_qdep); + cache_tag_flush_devtlb_all(domain, tag); break; } trace_cache_tag_flush_all(tag); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } @@ -383,6 +504,7 @@ void cache_tag_flush_all(struct dmar_domain *domain) void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, unsigned long end) { + struct intel_iommu *iommu = NULL; unsigned long pages, mask, addr; struct cache_tag *tag; unsigned long flags; @@ -391,7 +513,9 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, spin_lock_irqsave(&domain->cache_lock, flags); list_for_each_entry(tag, &domain->cache_tags, node) { - struct intel_iommu *iommu = tag->iommu; + if (iommu && iommu != tag->iommu) + qi_batch_flush_descs(iommu, domain->qi_batch); + iommu = tag->iommu; if (!cap_caching_mode(iommu->cap) || domain->use_first_level) { iommu_flush_write_buffer(iommu); @@ -399,22 +523,11 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, } if (tag->type == CACHE_TAG_IOTLB || - tag->type == CACHE_TAG_NESTING_IOTLB) { - /* - * Fallback to domain selective flush if no - * PSI support or the size is too big. 
- */ - if (!cap_pgsel_inv(iommu->cap) || - mask > cap_max_amask_val(iommu->cap)) - iommu->flush.flush_iotlb(iommu, tag->domain_id, - 0, 0, DMA_TLB_DSI_FLUSH); - else - iommu->flush.flush_iotlb(iommu, tag->domain_id, - addr, mask, - DMA_TLB_PSI_FLUSH); - } + tag->type == CACHE_TAG_NESTING_IOTLB) + cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); } + qi_batch_flush_descs(iommu, domain->qi_batch); spin_unlock_irqrestore(&domain->cache_lock, flags); } diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 1c8d3141cb55..eaf862e8dea1 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1204,9 +1204,7 @@ static void free_iommu(struct intel_iommu *iommu) */ static inline void reclaim_free_desc(struct q_inval *qi) { - while (qi->desc_status[qi->free_tail] == QI_DONE || - qi->desc_status[qi->free_tail] == QI_ABORT) { - qi->desc_status[qi->free_tail] = QI_FREE; + while (qi->desc_status[qi->free_tail] == QI_FREE && qi->free_tail != qi->free_head) { qi->free_tail = (qi->free_tail + 1) % QI_LENGTH; qi->free_cnt++; } @@ -1463,8 +1461,16 @@ restart: raw_spin_lock(&qi->q_lock); } - for (i = 0; i < count; i++) - qi->desc_status[(index + i) % QI_LENGTH] = QI_DONE; + /* + * The reclaim code can free descriptors from multiple submissions + * starting from the tail of the queue. When count == 0, the + * status of the standalone wait descriptor at the tail of the queue + * must be set to QI_FREE to allow the reclaim code to proceed. + * It is also possible that descriptors from one of the previous + * submissions has to be reclaimed by a subsequent submission. + */ + for (i = 0; i <= count; i++) + qi->desc_status[(index + i) % QI_LENGTH] = QI_FREE; reclaim_free_desc(qi); raw_spin_unlock_irqrestore(&qi->q_lock, flags); @@ -1520,24 +1526,9 @@ void qi_flush_context(struct intel_iommu *iommu, u16 did, u16 sid, u8 fm, void qi_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int size_order, u64 type) { - u8 dw = 0, dr = 0; - struct qi_desc desc; - int ih = 0; - - if (cap_write_drain(iommu->cap)) - dw = 1; - - if (cap_read_drain(iommu->cap)) - dr = 1; - - desc.qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) - | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; - desc.qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) - | QI_IOTLB_AM(size_order); - desc.qw2 = 0; - desc.qw3 = 0; + qi_desc_iotlb(iommu, did, addr, size_order, type, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1555,20 +1546,7 @@ void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - if (mask) { - addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; - desc.qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; - } else - desc.qw1 = QI_DEV_IOTLB_ADDR(addr); - - if (qdep >= QI_DEV_IOTLB_MAX_INVS) - qdep = 0; - - desc.qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | - QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); - desc.qw2 = 0; - desc.qw3 = 0; - + qi_desc_dev_iotlb(sid, pfsid, qdep, addr, mask, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1588,28 +1566,7 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, return; } - if (npages == -1) { - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = 0; - } else { - int mask = ilog2(__roundup_pow_of_two(npages)); - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); - - if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) - 
addr = ALIGN_DOWN(addr, align); - - desc.qw0 = QI_EIOTLB_PASID(pasid) | - QI_EIOTLB_DID(did) | - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | - QI_EIOTLB_TYPE; - desc.qw1 = QI_EIOTLB_ADDR(addr) | - QI_EIOTLB_IH(ih) | - QI_EIOTLB_AM(mask); - } - + qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); qi_submit_sync(iommu, &desc, 1, 0); } @@ -1617,7 +1574,6 @@ void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, u32 pasid, u16 qdep, u64 addr, unsigned int size_order) { - unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); struct qi_desc desc = {.qw1 = 0, .qw2 = 0, .qw3 = 0}; /* @@ -1629,40 +1585,9 @@ void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, if (!(iommu->gcmd & DMA_GCMD_TE)) return; - desc.qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | - QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | - QI_DEV_IOTLB_PFSID(pfsid); - - /* - * If S bit is 0, we only flush a single page. If S bit is set, - * The least significant zero bit indicates the invalidation address - * range. VT-d spec 6.5.2.6. - * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. - * size order = 0 is PAGE_SIZE 4KB - * Max Invs Pending (MIP) is set to 0 for now until we have DIT in - * ECAP. - */ - if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) - pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", - addr, size_order); - - /* Take page address */ - desc.qw1 = QI_DEV_EIOTLB_ADDR(addr); - - if (size_order) { - /* - * Existing 0s in address below size_order may be the least - * significant bit, we must set them to 1s to avoid having - * smaller size than desired. - */ - desc.qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, - VTD_PAGE_SHIFT); - /* Clear size_order bit to indicate size */ - desc.qw1 &= ~mask; - /* Set the S bit to indicate flushing more than 1 page */ - desc.qw1 |= QI_DEV_EIOTLB_SIZE; - } - + qi_desc_dev_iotlb_pasid(sid, pfsid, pasid, + qdep, addr, size_order, + &desc); qi_submit_sync(iommu, &desc, 1, 0); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 9ff8b83c19a3..9f6b0780f2ef 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -167,15 +167,6 @@ static void device_rbtree_remove(struct device_domain_info *info) spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags); } -/* - * This domain is a statically identity mapping domain. - * 1. This domain creats a static 1:1 mapping to all usable memory. - * 2. It maps to each iommu if successful. - * 3. Each iommu mapps to this domain if successful. 
- */ -static struct dmar_domain *si_domain; -static int hw_pass_through = 1; - struct dmar_rmrr_unit { struct list_head list; /* list of rmrr units */ struct acpi_dmar_header *hdr; /* ACPI header */ @@ -293,11 +284,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -static int domain_type_is_si(struct dmar_domain *domain) -{ - return domain->domain.type == IOMMU_DOMAIN_IDENTITY; -} - static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; @@ -492,7 +478,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain) domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw); domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain); - domain_update_iotlb(domain); } struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, @@ -1199,9 +1184,8 @@ static void __iommu_flush_context(struct intel_iommu *iommu, raw_spin_unlock_irqrestore(&iommu->register_lock, flag); } -/* return value determine if we need a write buffer flush */ -static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type) +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; @@ -1270,32 +1254,6 @@ domain_lookup_dev_info(struct dmar_domain *domain, return NULL; } -void domain_update_iotlb(struct dmar_domain *domain) -{ - struct dev_pasid_info *dev_pasid; - struct device_domain_info *info; - bool has_iotlb_device = false; - unsigned long flags; - - spin_lock_irqsave(&domain->lock, flags); - list_for_each_entry(info, &domain->devices, link) { - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - - list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { - info = dev_iommu_priv_get(dev_pasid->dev); - if (info->ats_enabled) { - has_iotlb_device = true; - break; - } - } - domain->has_iotlb_device = has_iotlb_device; - spin_unlock_irqrestore(&domain->lock, flags); -} - /* * The extra devTLB flush quirk impacts those QAT devices with PCI device * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device() @@ -1322,20 +1280,9 @@ static void iommu_enable_pci_caps(struct device_domain_info *info) return; pdev = to_pci_dev(info->dev); - - /* The PCIe spec, in its wisdom, declares that the behaviour of - the device if you enable PASID support after ATS support is - undefined. So always enable PASID support on devices which - have it, even if we can't yet know if we're ever going to - use it. 
*/ - if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1)) - info->pasid_enabled = 1; - if (info->ats_supported && pci_ats_page_aligned(pdev) && - !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) { + !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) info->ats_enabled = 1; - domain_update_iotlb(info->domain); - } } static void iommu_disable_pci_caps(struct device_domain_info *info) @@ -1350,12 +1297,6 @@ static void iommu_disable_pci_caps(struct device_domain_info *info) if (info->ats_enabled) { pci_disable_ats(pdev); info->ats_enabled = 0; - domain_update_iotlb(info->domain); - } - - if (info->pasid_enabled) { - pci_disable_pasid(pdev); - info->pasid_enabled = 0; } } @@ -1447,10 +1388,10 @@ static int iommu_init_domains(struct intel_iommu *iommu) * entry for first-level or pass-through translation modes should * be programmed with a domain id different from those used for * second-level or nested translation. We reserve a domain id for - * this purpose. + * this purpose. This domain id is also used for identity domain + * in legacy mode. */ - if (sm_supported(iommu)) - set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); + set_bit(FLPT_DEFAULT_DID, iommu->domain_ids); return 0; } @@ -1524,7 +1465,6 @@ static struct dmar_domain *alloc_domain(unsigned int type) domain->nid = NUMA_NO_NODE; if (first_level_by_default(type)) domain->use_first_level = true; - domain->has_iotlb_device = false; INIT_LIST_HEAD(&domain->devices); INIT_LIST_HEAD(&domain->dev_pasids); INIT_LIST_HEAD(&domain->cache_tags); @@ -1632,9 +1572,65 @@ static void domain_exit(struct dmar_domain *domain) if (WARN_ON(!list_empty(&domain->devices))) return; + kfree(domain->qi_batch); kfree(domain); } +/* + * For kdump cases, old valid entries may be cached due to the + * in-flight DMA and copied pgtable, but there is no unmapping + * behaviour for them, thus we need an explicit cache flush for + * the newly-mapped device. For kdump, at this point, the device + * is supposed to finish reset at its driver probe stage, so no + * in-flight DMA will exist, and we don't need to worry anymore + * hereafter. + */ +static void copied_context_tear_down(struct intel_iommu *iommu, + struct context_entry *context, + u8 bus, u8 devfn) +{ + u16 did_old; + + if (!context_copied(iommu, bus, devfn)) + return; + + assert_spin_locked(&iommu->lock); + + did_old = context_domain_id(context); + context_clear_entry(context); + + if (did_old < cap_ndoms(iommu->cap)) { + iommu->flush.flush_context(iommu, did_old, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did_old, 0, 0, + DMA_TLB_DSI_FLUSH); + } + + clear_context_copied(iommu, bus, devfn); +} + +/* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. 
If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ +static void context_present_cache_flush(struct intel_iommu *iommu, u16 did, + u8 bus, u8 devfn) +{ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + } else { + iommu_flush_write_buffer(iommu); + } +} + static int domain_context_mapping_one(struct dmar_domain *domain, struct intel_iommu *iommu, u8 bus, u8 devfn) @@ -1647,9 +1643,6 @@ static int domain_context_mapping_one(struct dmar_domain *domain, struct context_entry *context; int agaw, ret; - if (hw_pass_through && domain_type_is_si(domain)) - translation = CONTEXT_TT_PASS_THROUGH; - pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1663,83 +1656,35 @@ static int domain_context_mapping_one(struct dmar_domain *domain, if (context_present(context) && !context_copied(iommu, bus, devfn)) goto out_unlock; - /* - * For kdump cases, old valid entries may be cached due to the - * in-flight DMA and copied pgtable, but there is no unmapping - * behaviour for them, thus we need an explicit cache flush for - * the newly-mapped device. For kdump, at this point, the device - * is supposed to finish reset at its driver probe stage, so no - * in-flight DMA will exist, and we don't need to worry anymore - * hereafter. - */ - if (context_copied(iommu, bus, devfn)) { - u16 did_old = context_domain_id(context); - - if (did_old < cap_ndoms(iommu->cap)) { - iommu->flush.flush_context(iommu, did_old, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did_old, 0, 0, - DMA_TLB_DSI_FLUSH); - } - - clear_context_copied(iommu, bus, devfn); - } - + copied_context_tear_down(iommu, context, bus, devfn); context_clear_entry(context); - context_set_domain_id(context, did); - if (translation != CONTEXT_TT_PASS_THROUGH) { - /* - * Skip top levels of page tables for iommu which has - * less agaw than default. Unnecessary for PT mode. - */ - for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { - ret = -ENOMEM; - pgd = phys_to_virt(dma_pte_addr(pgd)); - if (!dma_pte_present(pgd)) - goto out_unlock; - } - - if (info && info->ats_supported) - translation = CONTEXT_TT_DEV_IOTLB; - else - translation = CONTEXT_TT_MULTI_LEVEL; + context_set_domain_id(context, did); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_address_width(context, agaw); - } else { - /* - * In pass through mode, AW must be programmed to - * indicate the largest AGAW value supported by - * hardware. And ASR is ignored by hardware. - */ - context_set_address_width(context, iommu->msagaw); + /* + * Skip top levels of page tables for iommu which has + * less agaw than default. Unnecessary for PT mode. 
+ */ + for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) { + ret = -ENOMEM; + pgd = phys_to_virt(dma_pte_addr(pgd)); + if (!dma_pte_present(pgd)) + goto out_unlock; } + if (info && info->ats_supported) + translation = CONTEXT_TT_DEV_IOTLB; + else + translation = CONTEXT_TT_MULTI_LEVEL; + + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, agaw); context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - - /* - * It's a non-present to present mapping. If hardware doesn't cache - * non-present entry we only need to flush the write-buffer. If the - * _does_ cache non-present entries, then it does so in the special - * domain #0, which we have to flush: - */ - if (cap_caching_mode(iommu->cap)) { - iommu->flush.flush_context(iommu, 0, - (((u16)bus) << 8) | devfn, - DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL); - iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - } else { - iommu_flush_write_buffer(iommu); - } - + context_present_cache_flush(iommu, did, bus, devfn); ret = 0; out_unlock: @@ -1944,6 +1889,7 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 { struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, 0); @@ -1952,10 +1898,11 @@ static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); } static int domain_setup_first_level(struct intel_iommu *iommu, @@ -1998,80 +1945,6 @@ static bool dev_is_real_dma_subdevice(struct device *dev) pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev); } -static int iommu_domain_identity_map(struct dmar_domain *domain, - unsigned long first_vpfn, - unsigned long last_vpfn) -{ - /* - * RMRR range might have overlap with physical memory range, - * clear it first - */ - dma_pte_clear_range(domain, first_vpfn, last_vpfn); - - return __domain_mapping(domain, first_vpfn, - first_vpfn, last_vpfn - first_vpfn + 1, - DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL); -} - -static int md_domain_init(struct dmar_domain *domain, int guest_width); - -static int __init si_domain_init(int hw) -{ - struct dmar_rmrr_unit *rmrr; - struct device *dev; - int i, nid, ret; - - si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY); - if (!si_domain) - return -EFAULT; - - if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { - domain_exit(si_domain); - si_domain = NULL; - return -EFAULT; - } - - if (hw) - return 0; - - for_each_online_node(nid) { - unsigned long start_pfn, end_pfn; - int i; - - for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start_pfn), - mm_to_dma_pfn_end(end_pfn-1)); - if (ret) - return ret; - } - } - - /* - * Identity map the RMRRs so that devices with RMRRs could also use - * the si_domain. 
- */ - for_each_rmrr_units(rmrr) { - for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt, - i, dev) { - unsigned long long start = rmrr->base_address; - unsigned long long end = rmrr->end_address; - - if (WARN_ON(end < start || - end >> agaw_to_width(si_domain->agaw))) - continue; - - ret = iommu_domain_identity_map(si_domain, - mm_to_dma_pfn_start(start >> PAGE_SHIFT), - mm_to_dma_pfn_end(end >> PAGE_SHIFT)); - if (ret) - return ret; - } - } - - return 0; -} - static int dmar_domain_attach_device(struct dmar_domain *domain, struct device *dev) { @@ -2094,8 +1967,6 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (!sm_supported(iommu)) ret = domain_context_mapping(domain, dev); - else if (hw_pass_through && domain_type_is_si(domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); else if (domain->use_first_level) ret = domain_setup_first_level(iommu, domain, dev, IOMMU_NO_PASID); else @@ -2104,8 +1975,7 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (ret) goto out_block_translation; - if (sm_supported(info->iommu) || !domain_type_is_si(info->domain)) - iommu_enable_pci_caps(info); + iommu_enable_pci_caps(info); ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID); if (ret) @@ -2149,6 +2019,16 @@ static bool device_rmrr_is_relaxable(struct device *dev) static int device_def_domain_type(struct device *dev) { + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + /* + * Hardware does not support the passthrough translation mode. + * Always use a dynamaic mapping domain. + */ + if (!ecap_pass_through(iommu->ecap)) + return IOMMU_DOMAIN_DMA; + if (dev_is_pci(dev)) { struct pci_dev *pdev = to_pci_dev(dev); @@ -2439,8 +2319,6 @@ static int __init init_dmars(void) } } - if (!ecap_pass_through(iommu->ecap)) - hw_pass_through = 0; intel_svm_check(iommu); } @@ -2456,10 +2334,6 @@ static int __init init_dmars(void) check_tylersburg_isoch(); - ret = si_domain_init(hw_pass_through); - if (ret) - goto free_iommu; - /* * for each drhd * enable fault log @@ -2505,10 +2379,6 @@ free_iommu: disable_dmar_iommu(iommu); free_dmar_iommu(iommu); } - if (si_domain) { - domain_exit(si_domain); - si_domain = NULL; - } return ret; } @@ -2883,12 +2753,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) if (ret) goto out; - if (hw_pass_through && !ecap_pass_through(iommu->ecap)) { - pr_warn("%s: Doesn't support hardware pass through.\n", - iommu->name); - return -ENXIO; - } - sp = domain_update_iommu_superpage(NULL, iommu) - 1; if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) { pr_warn("%s: Doesn't support large page.\n", @@ -3139,43 +3003,6 @@ int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info) return 0; } -static int intel_iommu_memory_notifier(struct notifier_block *nb, - unsigned long val, void *v) -{ - struct memory_notify *mhp = v; - unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn); - unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn + - mhp->nr_pages - 1); - - switch (val) { - case MEM_GOING_ONLINE: - if (iommu_domain_identity_map(si_domain, - start_vpfn, last_vpfn)) { - pr_warn("Failed to build identity map for [%lx-%lx]\n", - start_vpfn, last_vpfn); - return NOTIFY_BAD; - } - break; - - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - { - LIST_HEAD(freelist); - - domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist); - iommu_put_pages_list(&freelist); - } - break; - } - - return NOTIFY_OK; -} - -static struct 
notifier_block intel_iommu_memory_nb = { - .notifier_call = intel_iommu_memory_notifier, - .priority = 0 -}; - static void intel_disable_iommus(void) { struct intel_iommu *iommu = NULL; @@ -3472,12 +3299,7 @@ int __init intel_iommu_init(void) iommu_pmu_register(iommu); } - up_read(&dmar_global_lock); - - if (si_domain && !hw_pass_through) - register_memory_notifier(&intel_iommu_memory_nb); - down_read(&dmar_global_lock); if (probe_acpi_namespace_devices()) pr_warn("ACPI name space devices didn't probe correctly\n"); @@ -3622,7 +3444,6 @@ static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_st xa_init(&domain->iommu_array); domain->nid = dev_to_node(dev); - domain->has_iotlb_device = info->ats_enabled; domain->use_first_level = first_stage; /* calculate the address width */ @@ -3691,8 +3512,6 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) domain->geometry.force_aperture = true; return domain; - case IOMMU_DOMAIN_IDENTITY: - return &si_domain->domain; default: return NULL; } @@ -3759,8 +3578,7 @@ static void intel_iommu_domain_free(struct iommu_domain *domain) WARN_ON(dmar_domain->nested_parent && !list_empty(&dmar_domain->s1_domains)); - if (domain != &si_domain->domain) - domain_exit(dmar_domain); + domain_exit(dmar_domain); } int prepare_domain_attach_device(struct iommu_domain *domain, @@ -3810,11 +3628,9 @@ int prepare_domain_attach_device(struct iommu_domain *domain, static int intel_iommu_attach_device(struct iommu_domain *domain, struct device *dev) { - struct device_domain_info *info = dev_iommu_priv_get(dev); int ret; - if (info->domain) - device_block_translation(dev); + device_block_translation(dev); ret = prepare_domain_attach_device(domain, dev); if (ret) @@ -4091,6 +3907,7 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) dev_iommu_priv_set(dev, info); if (pdev && pci_ats_supported(pdev)) { + pci_prepare_ats(pdev, VTD_PAGE_SHIFT); ret = device_rbtree_insert(iommu, info); if (ret) goto free; @@ -4112,6 +3929,16 @@ static struct iommu_device *intel_iommu_probe_device(struct device *dev) intel_iommu_debugfs_create_dev(info); + /* + * The PCIe spec, in its wisdom, declares that the behaviour of the + * device is undefined if you enable PASID support after ATS support. + * So always enable PASID support on devices which have it, even if + * we can't yet know if we're ever going to use it. 
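/*
 * Editorial note, summarising the ordering this hunk establishes (the call
 * sites are spread across several functions in this file); the functions
 * named below are real, the grouping is illustrative:
 *
 *   intel_iommu_probe_device():  pci_prepare_ats(), then pci_enable_pasid()
 *   domain attach, via iommu_enable_pci_caps():    pci_enable_ats()
 *   domain detach, via iommu_disable_pci_caps():   pci_disable_ats()
 *   intel_iommu_release_device():                  pci_disable_pasid()
 *
 * i.e. PASID is switched on once at probe time and stays on until the device
 * is released, so ATS is never enabled ahead of PASID.
 */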
+ */ + if (info->pasid_supported && + !pci_enable_pasid(pdev, info->pasid_supported & ~1)) + info->pasid_enabled = 1; + return &iommu->iommu; free_table: intel_pasid_free_table(dev); @@ -4128,6 +3955,11 @@ static void intel_iommu_release_device(struct device *dev) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + if (info->pasid_enabled) { + pci_disable_pasid(to_pci_dev(dev)); + info->pasid_enabled = 0; + } + mutex_lock(&iommu->iopf_lock); if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev))) device_rbtree_remove(info); @@ -4249,6 +4081,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) struct intel_iommu *iommu = info->iommu; u8 bus = info->bus, devfn = info->devfn; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); if (context_copied(iommu, bus, devfn)) { @@ -4261,6 +4094,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) spin_unlock(&iommu->lock); return -ENODEV; } + did = context_domain_id(context); if (enable) context_set_sm_pre(context); @@ -4269,7 +4103,7 @@ static int context_flip_pri(struct device_domain_info *info, bool enable) if (!ecap_coherent(iommu->ecap)) clflush_cache_range(context, sizeof(*context)); - intel_context_flush_present(info, context, true); + intel_context_flush_present(info, context, did, true); spin_unlock(&iommu->lock); return 0; @@ -4420,11 +4254,17 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); - struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; + struct dmar_domain *dmar_domain; unsigned long flags; + if (domain->type == IOMMU_DOMAIN_IDENTITY) { + intel_pasid_tear_down_entry(iommu, dev, pasid, false); + return; + } + + dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { @@ -4479,9 +4319,7 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (ret) goto out_detach_iommu; - if (domain_type_is_si(dmar_domain)) - ret = intel_pasid_setup_pass_through(iommu, dev, pasid); - else if (dmar_domain->use_first_level) + if (dmar_domain->use_first_level) ret = domain_setup_first_level(iommu, dmar_domain, dev, pasid); else @@ -4651,9 +4489,111 @@ static const struct iommu_dirty_ops intel_dirty_ops = { .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, }; +static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct context_entry *context; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, bus, devfn, 1); + if (!context) { + spin_unlock(&iommu->lock); + return -ENOMEM; + } + + if (context_present(context) && !context_copied(iommu, bus, devfn)) { + spin_unlock(&iommu->lock); + return 0; + } + + copied_context_tear_down(iommu, context, bus, devfn); + context_clear_entry(context); + context_set_domain_id(context, FLPT_DEFAULT_DID); + + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. 
+ */ + context_set_address_width(context, iommu->msagaw); + context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH); + context_set_fault_enable(context); + context_set_present(context); + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(context, sizeof(*context)); + context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn); + spin_unlock(&iommu->lock); + + return 0; +} + +static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data) +{ + struct device *dev = data; + + if (dev != &pdev->dev) + return 0; + + return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff); +} + +static int device_setup_pass_through(struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) + return context_setup_pass_through(dev, info->bus, info->devfn); + + return pci_for_each_dma_alias(to_pci_dev(dev), + context_setup_pass_through_cb, dev); +} + +static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + int ret; + + device_block_translation(dev); + + if (dev_is_real_dma_subdevice(dev)) + return 0; + + if (sm_supported(iommu)) { + ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID); + if (!ret) + iommu_enable_pci_caps(info); + } else { + ret = device_setup_pass_through(dev); + } + + return ret; +} + +static int identity_domain_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + + if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) + return -EOPNOTSUPP; + + return intel_pasid_setup_pass_through(iommu, dev, pasid); +} + +static struct iommu_domain identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = identity_domain_attach_dev, + .set_dev_pasid = identity_domain_set_dev_pasid, + }, +}; + const struct iommu_ops intel_iommu_ops = { .blocked_domain = &blocking_domain, .release_domain = &blocking_domain, + .identity_domain = &identity_domain, .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b67c14da1240..428d253f1348 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -584,11 +584,23 @@ struct iommu_domain_info { * to VT-d spec, section 9.3 */ }; +/* + * We start simply by using a fixed size for the batched descriptors. This + * size is currently sufficient for our needs. Future improvements could + * involve dynamically allocating the batch buffer based on actual demand, + * allowing us to adjust the batch size for optimal performance in different + * scenarios. 
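/*
 * Illustrative sketch (editorial, not part of the patch): how the qi_batch
 * buffer and the qi_desc_*() helpers introduced just below are presumably
 * consumed by a flush path.  Descriptors are staged in the per-domain buffer
 * and pushed out with qi_submit_sync() once the buffer fills up (or when the
 * caller is finished).  The two function names here are made up for the
 * example; only struct qi_batch, qi_desc_iotlb() and qi_submit_sync() come
 * from this patch.
 */
static void qi_batch_flush(struct intel_iommu *iommu, struct qi_batch *batch)
{
	if (!batch->index)
		return;

	qi_submit_sync(iommu, batch->descs, batch->index, 0);
	batch->index = 0;
}

static void qi_batch_add_iotlb(struct intel_iommu *iommu, struct qi_batch *batch,
			       u16 did, u64 addr, unsigned int size_order, u64 type)
{
	qi_desc_iotlb(iommu, did, addr, size_order, type,
		      &batch->descs[batch->index++]);
	if (batch->index == QI_MAX_BATCHED_DESC_COUNT)
		qi_batch_flush(iommu, batch);
}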
+ */ +#define QI_MAX_BATCHED_DESC_COUNT 16 +struct qi_batch { + struct qi_desc descs[QI_MAX_BATCHED_DESC_COUNT]; + unsigned int index; +}; + struct dmar_domain { int nid; /* node id */ struct xarray iommu_array; /* Attached IOMMU array */ - u8 has_iotlb_device: 1; u8 iommu_coherency: 1; /* indicate coherency of iommu access */ u8 force_snooping : 1; /* Create IOPTEs with snoop control */ u8 set_pte_snp:1; @@ -609,6 +621,7 @@ struct dmar_domain { spinlock_t cache_lock; /* Protect the cache tag list */ struct list_head cache_tags; /* Cache tag list */ + struct qi_batch *qi_batch; /* Batched QI descriptors */ int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, @@ -1067,6 +1080,115 @@ static inline unsigned long nrpages_to_size(unsigned long npages) return npages << VTD_PAGE_SHIFT; } +static inline void qi_desc_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type, + struct qi_desc *desc) +{ + u8 dw = 0, dr = 0; + int ih = 0; + + if (cap_write_drain(iommu->cap)) + dw = 1; + + if (cap_read_drain(iommu->cap)) + dr = 1; + + desc->qw0 = QI_IOTLB_DID(did) | QI_IOTLB_DR(dr) | QI_IOTLB_DW(dw) + | QI_IOTLB_GRAN(type) | QI_IOTLB_TYPE; + desc->qw1 = QI_IOTLB_ADDR(addr) | QI_IOTLB_IH(ih) + | QI_IOTLB_AM(size_order); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_dev_iotlb(u16 sid, u16 pfsid, u16 qdep, u64 addr, + unsigned int mask, struct qi_desc *desc) +{ + if (mask) { + addr |= (1ULL << (VTD_PAGE_SHIFT + mask - 1)) - 1; + desc->qw1 = QI_DEV_IOTLB_ADDR(addr) | QI_DEV_IOTLB_SIZE; + } else { + desc->qw1 = QI_DEV_IOTLB_ADDR(addr); + } + + if (qdep >= QI_DEV_IOTLB_MAX_INVS) + qdep = 0; + + desc->qw0 = QI_DEV_IOTLB_SID(sid) | QI_DEV_IOTLB_QDEP(qdep) | + QI_DIOTLB_TYPE | QI_DEV_IOTLB_PFSID(pfsid); + desc->qw2 = 0; + desc->qw3 = 0; +} + +static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, + unsigned long npages, bool ih, + struct qi_desc *desc) +{ + if (npages == -1) { + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = 0; + } else { + int mask = ilog2(__roundup_pow_of_two(npages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); + + if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) + addr = ALIGN_DOWN(addr, align); + + desc->qw0 = QI_EIOTLB_PASID(pasid) | + QI_EIOTLB_DID(did) | + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | + QI_EIOTLB_TYPE; + desc->qw1 = QI_EIOTLB_ADDR(addr) | + QI_EIOTLB_IH(ih) | + QI_EIOTLB_AM(mask); + } +} + +static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, + u16 qdep, u64 addr, + unsigned int size_order, + struct qi_desc *desc) +{ + unsigned long mask = 1UL << (VTD_PAGE_SHIFT + size_order - 1); + + desc->qw0 = QI_DEV_EIOTLB_PASID(pasid) | QI_DEV_EIOTLB_SID(sid) | + QI_DEV_EIOTLB_QDEP(qdep) | QI_DEIOTLB_TYPE | + QI_DEV_IOTLB_PFSID(pfsid); + + /* + * If S bit is 0, we only flush a single page. If S bit is set, + * The least significant zero bit indicates the invalidation address + * range. VT-d spec 6.5.2.6. + * e.g. address bit 12[0] indicates 8KB, 13[0] indicates 16KB. + * size order = 0 is PAGE_SIZE 4KB + * Max Invs Pending (MIP) is set to 0 for now until we have DIT in + * ECAP. 
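 *
 * Worked example (editorial addition), assuming VTD_PAGE_SHIFT == 12:
 *   size_order = 0:  qw1 holds the plain page address, S bit clear,
 *                    a single 4KB page is invalidated.
 *   size_order = 1:  GENMASK sets bit 12, the ~mask step clears it again,
 *                    S bit set -> address bit 12 is 0, meaning 8KB.
 *   size_order = 2:  bits 13:12 are set, bit 13 is cleared, S bit set ->
 *                    bits 13:12 read 01, meaning 16KB.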
+ */ + if (!IS_ALIGNED(addr, VTD_PAGE_SIZE << size_order)) + pr_warn_ratelimited("Invalidate non-aligned address %llx, order %d\n", + addr, size_order); + + /* Take page address */ + desc->qw1 = QI_DEV_EIOTLB_ADDR(addr); + + if (size_order) { + /* + * Existing 0s in address below size_order may be the least + * significant bit, we must set them to 1s to avoid having + * smaller size than desired. + */ + desc->qw1 |= GENMASK_ULL(size_order + VTD_PAGE_SHIFT - 1, + VTD_PAGE_SHIFT); + /* Clear size_order bit to indicate size */ + desc->qw1 &= ~mask; + /* Set the S bit to indicate flushing more than 1 page */ + desc->qw1 |= QI_DEV_EIOTLB_SIZE; + } +} + /* Convert value to context PASID directory size field coding. */ #define context_pdts(pds) (((pds) & 0x7) << 9) @@ -1098,13 +1220,15 @@ void qi_flush_pasid_cache(struct intel_iommu *iommu, u16 did, u64 granu, int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, unsigned int count, unsigned long options); + +void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, + unsigned int size_order, u64 type); /* * Options used in qi_submit_sync: * QI_OPT_WAIT_DRAIN - Wait for PRQ drain completion, spec 6.5.2.8. */ #define QI_OPT_WAIT_DRAIN BIT(0) -void domain_update_iotlb(struct dmar_domain *domain); int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); @@ -1154,7 +1278,7 @@ void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start, void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool affect_domains); + u16 did, bool affect_domains); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index 16a2bcf5cfeb..433c58944401 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -66,8 +66,6 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, list_add(&info->link, &dmar_domain->devices); spin_unlock_irqrestore(&dmar_domain->lock, flags); - domain_update_iotlb(dmar_domain); - return 0; unassign_tag: cache_tag_unassign_domain(dmar_domain, dev, IOMMU_NO_PASID); @@ -85,6 +83,7 @@ static void intel_nested_domain_free(struct iommu_domain *domain) spin_lock(&s2_domain->s1_lock); list_del(&dmar_domain->s2_link); spin_unlock(&s2_domain->s1_lock); + kfree(dmar_domain->qi_batch); kfree(dmar_domain); } diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 5792c817cefa..2e5fa0a23299 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -264,9 +264,7 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, else iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); } /* @@ -493,9 +491,7 @@ int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); - /* Device IOTLB doesn't need to be flushed in caching mode. 
*/ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); return 0; } @@ -572,9 +568,7 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, pasid_cache_invalidation_with_pasid(iommu, did, pasid); qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); - /* Device IOTLB doesn't need to be flushed in caching mode. */ - if (!cap_caching_mode(iommu->cap)) - devtlb_invalidation_with_pasid(iommu, dev, pasid); + devtlb_invalidation_with_pasid(iommu, dev, pasid); } /** @@ -683,6 +677,7 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; struct context_entry *context; + u16 did; spin_lock(&iommu->lock); context = iommu_context_addr(iommu, bus, devfn, false); @@ -691,10 +686,11 @@ static void device_pasid_table_teardown(struct device *dev, u8 bus, u8 devfn) return; } + did = context_domain_id(context); context_clear_entry(context); __iommu_flush_cache(iommu, context, sizeof(*context)); spin_unlock(&iommu->lock); - intel_context_flush_present(info, context, false); + intel_context_flush_present(info, context, did, false); } static int pci_pasid_table_teardown(struct pci_dev *pdev, u16 alias, void *data) @@ -885,10 +881,9 @@ static void __context_flush_dev_iotlb(struct device_domain_info *info) */ void intel_context_flush_present(struct device_domain_info *info, struct context_entry *context, - bool flush_domains) + u16 did, bool flush_domains) { struct intel_iommu *iommu = info->iommu; - u16 did = context_domain_id(context); struct pasid_entry *pte; int i; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 0e3a9b38bef2..078d1e32a24e 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -184,7 +184,10 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static void intel_mm_free_notifier(struct mmu_notifier *mn) { - kfree(container_of(mn, struct dmar_domain, notifier)); + struct dmar_domain *domain = container_of(mn, struct dmar_domain, notifier); + + kfree(domain->qi_batch); + kfree(domain); } static const struct mmu_notifier_ops intel_mmuops = { @@ -311,7 +314,7 @@ void intel_drain_pasid_prq(struct device *dev, u32 pasid) domain = info->domain; pdev = to_pci_dev(dev); sid = PCI_DEVID(info->bus, info->devfn); - did = domain_id_iommu(domain, iommu); + did = domain ? domain_id_iommu(domain, iommu) : FLPT_DEFAULT_DID; qdep = pci_ats_queue_depth(pdev); /* diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index cd679c13752e..4674e618797c 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -115,6 +115,59 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, return group; } +static struct iommu_attach_handle *find_fault_handler(struct device *dev, + struct iopf_fault *evt) +{ + struct iommu_fault *fault = &evt->fault; + struct iommu_attach_handle *attach_handle; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + fault->prm.pasid, 0); + if (IS_ERR(attach_handle)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->user_pasid_table) + return NULL; + /* + * The iommu driver for this device supports user- + * managed PASID table. Therefore page faults for + * any PASID should go through the NESTING domain + * attached to the device RID. 
+ */ + attach_handle = iommu_attach_handle_get( + dev->iommu_group, IOMMU_NO_PASID, + IOMMU_DOMAIN_NESTED); + if (IS_ERR(attach_handle)) + return NULL; + } + } else { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + IOMMU_NO_PASID, 0); + + if (IS_ERR(attach_handle)) + return NULL; + } + + if (!attach_handle->domain->iopf_handler) + return NULL; + + return attach_handle; +} + +static void iopf_error_response(struct device *dev, struct iopf_fault *evt) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_fault *fault = &evt->fault; + struct iommu_page_response resp = { + .pasid = fault->prm.pasid, + .grpid = fault->prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; + + ops->page_response(dev, evt, &resp); +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -153,23 +206,39 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * handling framework should guarantee that the iommu domain could only be * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults - * have been flushed. + * have been flushed. In case no page fault handler is attached or no iopf params + * are setup, then the ops->page_response() is called to complete the evt. + * + * Returns 0 on success, or an error in case of a bad/failed iopf setup. */ -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + struct iommu_attach_handle *attach_handle; struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; + attach_handle = find_fault_handler(dev, evt); + if (!attach_handle) + goto err_bad_iopf; + + /* + * Something has gone wrong if a fault capable domain is attached but no + * iopf_param is setup + */ iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return; + goto err_bad_iopf; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - report_partial_fault(iopf_param, fault); + int ret; + + ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ + + return ret; } /* @@ -184,38 +253,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - group->attach_handle = iommu_attach_handle_get(dev->iommu_group, - fault->prm.pasid, - 0); - if (IS_ERR(group->attach_handle)) { - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->user_pasid_table) - goto err_abort; - - /* - * The iommu driver for this device supports user- - * managed PASID table. Therefore page faults for - * any PASID should go through the NESTING domain - * attached to the device RID. 
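/*
 * Illustrative sketch (editorial, not part of the patch): what the new int
 * return value means for a driver's page-request path.  A non-zero return
 * simply says the fault was not handed to a fault handler; the core takes
 * care of any response that is needed (IOMMU_PAGE_RESP_INVALID for a page
 * request that could not be routed), so the caller must not respond again
 * and typically only logs or counts the event.  "report_prq_event" is a
 * made-up wrapper name for the example.
 */
static void report_prq_event(struct device *dev, struct iopf_fault *evt)
{
	if (iommu_report_device_fault(dev, evt))
		dev_dbg_ratelimited(dev, "page request %u not taken by any handler\n",
				    evt->fault.prm.grpid);
}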
- */ - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, - IOMMU_NO_PASID, - IOMMU_DOMAIN_NESTED); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - } else { - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - - if (!group->attach_handle->domain->iopf_handler) - goto err_abort; + group->attach_handle = attach_handle; /* * On success iopf_handler must call iopf_group_response() and @@ -224,7 +262,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group->attach_handle->domain->iopf_handler(group)) goto err_abort; - return; + return 0; err_abort: dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n", @@ -234,6 +272,14 @@ err_abort: __iopf_free_group(group); else iopf_free_group(group); + + return 0; + +err_bad_iopf: + if (fault->type == IOMMU_FAULT_PAGE_REQ) + iopf_error_response(dev, evt); + + return -EINVAL; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c index 75f244a3e12d..06ffc683b28f 100644 --- a/drivers/iommu/io-pgtable-arm-v7s.c +++ b/drivers/iommu/io-pgtable-arm-v7s.c @@ -552,9 +552,8 @@ static int arm_v7s_map_pages(struct io_pgtable_ops *ops, unsigned long iova, paddr >= (1ULL << data->iop.cfg.oas))) return -ERANGE; - /* If no access, then nothing to do */ if (!(prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; while (pgcount--) { ret = __arm_v7s_map(data, iova, paddr, pgsize, prot, 1, data->pgd, diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f5d9fd1f45bf..0e67f1721a3d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -274,13 +274,13 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, sizeof(*ptep) * num_entries, DMA_TO_DEVICE); } -static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg) +static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries) { + for (int i = 0; i < num_entries; i++) + ptep[i] = 0; - *ptep = 0; - - if (!cfg->coherent_walk) - __arm_lpae_sync_pte(ptep, 1, cfg); + if (!cfg->coherent_walk && num_entries) + __arm_lpae_sync_pte(ptep, num_entries, cfg); } static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, @@ -515,9 +515,8 @@ static int arm_lpae_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(iaext || paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; prot = arm_lpae_prot_to_pte(data, iommu_prot); ret = __arm_lpae_map(data, iova, paddr, pgsize, pgcount, prot, lvl, @@ -654,26 +653,29 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data, max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start; num_entries = min_t(int, pgcount, max_entries); - while (i < num_entries) { - pte = READ_ONCE(*ptep); + /* Find and handle non-leaf entries */ + for (i = 0; i < num_entries; i++) { + pte = READ_ONCE(ptep[i]); if (WARN_ON(!pte)) break; - __arm_lpae_clear_pte(ptep, &iop->cfg); - if (!iopte_leaf(pte, lvl, iop->fmt)) { + __arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1); + /* Also flush any partial walks */ io_pgtable_tlb_flush_walk(iop, iova + i * size, size, ARM_LPAE_GRANULE(data)); __arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data)); - } else if (!iommu_iotlb_gather_queued(gather)) { - 
io_pgtable_tlb_add_page(iop, gather, iova + i * size, size); } - - ptep++; - i++; } + /* Clear the remaining entries */ + __arm_lpae_clear_pte(ptep, &iop->cfg, i); + + if (gather && !iommu_iotlb_gather_queued(gather)) + for (int j = 0; j < i; j++) + io_pgtable_tlb_add_page(iop, gather, iova + j * size, size); + return i * size; } else if (iopte_leaf(pte, lvl, iop->fmt)) { /* diff --git a/drivers/iommu/io-pgtable-dart.c b/drivers/iommu/io-pgtable-dart.c index ad28031e1e93..c004640640ee 100644 --- a/drivers/iommu/io-pgtable-dart.c +++ b/drivers/iommu/io-pgtable-dart.c @@ -245,9 +245,8 @@ static int dart_map_pages(struct io_pgtable_ops *ops, unsigned long iova, if (WARN_ON(paddr >> cfg->oas)) return -ERANGE; - /* If no access, then nothing to do */ if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) - return 0; + return -EINVAL; tbl = dart_get_table(data, iova); diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9a7ec5997c61..3214a4c17c6b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,7 +526,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, err_unresv: if (hwpt_is_paging(hwpt)) iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + to_hwpt_paging(hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 742248276548..157a89b993e4 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -213,6 +213,10 @@ int iommufd_ioas_map(struct iommufd_ucmd *ucmd) if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id); if (IS_ERR(ioas)) return PTR_ERR(ioas); @@ -253,6 +257,10 @@ int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) cmd->dst_iova >= ULONG_MAX) return -EOVERFLOW; + if (!(cmd->flags & + (IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE))) + return -EINVAL; + src_ioas = iommufd_get_ioas(ucmd->ictx, cmd->src_ioas_id); if (IS_ERR(src_ioas)) return PTR_ERR(src_ioas); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index f95e32e29133..222cfc11ebfd 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -273,7 +273,7 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, return 0; } -const struct iommu_dirty_ops dirty_ops = { +static const struct iommu_dirty_ops dirty_ops = { .set_dirty_tracking = mock_domain_set_dirty_tracking, .read_and_clear_dirty = mock_domain_read_and_clear_dirty, }; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 78d61da75257..e7a6a1611d19 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -214,7 +214,7 @@ void of_iommu_get_resv_regions(struct device *dev, struct list_head *list) * that represent reservations in the IOVA space, which are regions that should * not be mapped. */ - if (of_find_property(it.node, "reg", NULL)) { + if (of_property_present(it.node, "reg")) { err = of_address_to_resource(it.node, 0, &phys); if (err < 0) { dev_err(dev, "failed to parse memory region %pOF: %d\n", |