27 files changed, 1462 insertions, 282 deletions
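Much of the x86 half of this diff revolves around the 32-bit AVIC "GA tag" that the SVM code programs into each IRTE so the AMD IOMMU's GA log can point back at a specific vCPU of a specific VM. As a reading aid, here is a minimal stand-alone sketch of that encoding; the constants mirror the AVIC_GATAG macros added to arch/x86/kvm/svm.c below, but the demo program itself is not part of the diff.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the AVIC_GATAG macros added to arch/x86/kvm/svm.c below:
 * the low 8 bits carry the vcpu_id, the next 24 bits the vm_id. */
#define AVIC_VCPU_ID_BITS	8
#define AVIC_VCPU_ID_MASK	((1u << AVIC_VCPU_ID_BITS) - 1)
#define AVIC_VM_ID_BITS		24
#define AVIC_VM_ID_MASK		((1u << AVIC_VM_ID_BITS) - 1)

#define AVIC_GATAG(vm, vcpu)	((((vm) & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
				 ((vcpu) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(t)	(((t) >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(t)	((t) & AVIC_VCPU_ID_MASK)

int main(void)
{
	/* svm_update_pi_irte() stores such a tag in the IRTE; the GA-log
	 * notifier later gets only the tag back and must recover both
	 * IDs, exactly as avic_ga_log_notifier() does in the hunk below. */
	uint32_t tag = AVIC_GATAG(0x2a, 3);

	printf("tag=%#x vm_id=%#x vcpu_id=%u\n", (unsigned)tag,
	       (unsigned)AVIC_GATAG_TO_VMID(tag),
	       (unsigned)AVIC_GATAG_TO_VCPUID(tag));
	return 0;
}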
diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 4da60b463995..ccc60324e738 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -53,6 +53,7 @@ stable kernels. | ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 | | ARM | Cortex-A57 | #852523 | N/A | | ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | +| ARM | Cortex-A72 | #853709 | N/A | | ARM | MMU-500 | #841119,#826419 | N/A | | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 46c030a49186..748ef7bdd64d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -460,6 +460,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. driver will print ACPI tables for AMD IOMMU during IOMMU initialization. + amd_iommu_intr= [HW,X86-64] + Specifies one of the following AMD IOMMU interrupt + remapping modes: + legacy - Use legacy interrupt remapping mode. + vapic - Use virtual APIC mode, which allows IOMMU + to inject interrupts directly into guest. + This mode requires kvm-amd.avic=1. + (Default when IOMMU HW support is present.) + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: <a>,<b> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index bda27b6b1aa2..29d0b23af2a9 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1309,7 +1309,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, smp_rmb(); pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); - if (is_error_pfn(pfn)) + if (is_error_noslot_pfn(pfn)) return -EFAULT; if (kvm_is_device_pfn(pfn)) { diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index ae7855f16ec2..5a84b4562603 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -256,7 +256,7 @@ static int __hyp_text __guest_run(struct kvm_vcpu *vcpu) /* * We must restore the 32-bit state before the sysregs, thanks - * to Cortex-A57 erratum #852523. + * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72). */ __sysreg32_restore_state(vcpu); __sysreg_restore_guest_state(guest_ctxt); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b0b225ceca18..e51367d159d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -823,14 +823,6 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Architected system registers. * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 * - * We could trap ID_DFR0 and tell the guest we don't support performance - * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was - * NAKed, so it will read the PMCR anyway. - * - * Therefore we tell the guest we have 0 counters. Unfortunately, we - * must always support PMCCNTR (the cycle counter): we just RAZ/WI for - * all PM registers, which doesn't crash the guest kernel at least. - * * Debug handling: We do trap most, if not all debug related system * registers. The implementation is good enough to ensure that a guest * can use these with minimal performance degradation. 
The drawback is @@ -1360,7 +1352,7 @@ static const struct sys_reg_desc cp15_regs[] = { { Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, c10_AMAIR1 }, /* ICC_SRE */ - { Op1( 0), CRn(12), CRm(12), Op2( 5), trap_raz_wi }, + { Op1( 0), CRn(12), CRm(12), Op2( 5), access_gic_sre }, { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID }, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 6cfdcf55572d..121008c0fcc9 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -40,7 +40,7 @@ static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn) srcu_idx = srcu_read_lock(&kvm->srcu); pfn = gfn_to_pfn(kvm, gfn); - if (is_error_pfn(pfn)) { + if (is_error_noslot_pfn(pfn)) { kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn); err = -EFAULT; goto out; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33ae3a4d0159..4c738c206be3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -781,9 +781,11 @@ struct kvm_arch { bool disabled_lapic_found; /* Struct members for AVIC */ + u32 avic_vm_id; u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; + struct hlist_node hnode; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3235e0fe7792..afa7bbb596cd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -366,7 +366,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB); + F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b62c85229711..23b99f305382 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1761,9 +1761,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if (value & MSR_IA32_APICBASE_ENABLE) { kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); static_key_slow_dec_deferred(&apic_hw_disabled); - } else + } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + recalculate_apic_map(vcpu->kvm); + } } if ((old_value ^ value) & X2APIC_ENABLE) { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3d4cc8cc56a3..d9c7e986b4e4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1207,7 +1207,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) * * Return true if tlb need be flushed. 
*/ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) +static bool spte_write_protect(u64 *sptep, bool pt_protect) { u64 spte = *sptep; @@ -1233,12 +1233,12 @@ static bool __rmap_write_protect(struct kvm *kvm, bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_write_protect(kvm, sptep, pt_protect); + flush |= spte_write_protect(sptep, pt_protect); return flush; } -static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_clear_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1256,12 +1256,12 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_clear_dirty(kvm, sptep); + flush |= spte_clear_dirty(sptep); return flush; } -static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) +static bool spte_set_dirty(u64 *sptep) { u64 spte = *sptep; @@ -1279,7 +1279,7 @@ static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - flush |= spte_set_dirty(kvm, sptep); + flush |= spte_set_dirty(sptep); return flush; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d84d102..db77c1ca9e76 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -34,6 +34,8 @@ #include <linux/sched.h> #include <linux/trace_events.h> #include <linux/slab.h> +#include <linux/amd-iommu.h> +#include <linux/hashtable.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -41,6 +43,7 @@ #include <asm/desc.h> #include <asm/debugreg.h> #include <asm/kvm_para.h> +#include <asm/irq_remapping.h> #include <asm/virtext.h> #include "trace.h" @@ -96,6 +99,19 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 #define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF +/* AVIC GATAG is encoded using VM and VCPU IDs */ +#define AVIC_VCPU_ID_BITS 8 +#define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) + +#define AVIC_VM_ID_BITS 24 +#define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) +#define AVIC_VM_ID_MASK ((1 << AVIC_VM_ID_BITS) - 1) + +#define AVIC_GATAG(x, y) (((x & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \ + (y & AVIC_VCPU_ID_MASK)) +#define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK) +#define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -185,6 +201,23 @@ struct vcpu_svm { struct page *avic_backing_page; u64 *avic_physical_id_cache; bool avic_is_running; + + /* + * Per-vcpu list of struct amd_svm_iommu_ir: + * This is used mainly to store interrupt remapping information used + * when updating the vcpu affinity. This avoids the need to scan for + * IRTE and try to match ga_tag in the IOMMU driver. + */ + struct list_head ir_list; + spinlock_t ir_list_lock; +}; + +/* + * This is a wrapper of struct amd_iommu_ir_data.
+ */ +struct amd_svm_iommu_ir { + struct list_head node; /* Used by SVM for per-vcpu ir_list */ + void *data; /* Storing pointer to struct amd_ir_data */ }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) @@ -242,6 +275,10 @@ static int avic; module_param(avic, int, S_IRUGO); #endif +/* AVIC VM ID bit masks and lock */ +static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); +static DEFINE_SPINLOCK(avic_vm_id_lock); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -928,6 +965,55 @@ static void svm_disable_lbrv(struct vcpu_svm *svm) set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); } +/* Note: + * This hash table is used to map VM_ID to a struct kvm_arch, + * when handling AMD IOMMU GALOG notification to schedule in + * a particular vCPU. + */ +#define SVM_VM_DATA_HASH_BITS 8 +DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static spinlock_t svm_vm_data_hash_lock; + +/* Note: + * This function is called from IOMMU driver to notify + * SVM to schedule in a particular vCPU of a particular VM. + */ +static int avic_ga_log_notifier(u32 ga_tag) +{ + unsigned long flags; + struct kvm_arch *ka = NULL; + struct kvm_vcpu *vcpu = NULL; + u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); + + pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_vm_id != vm_id) + continue; + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + break; + } + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + + if (!vcpu) + return 0; + + /* Note: + * At this point, the IOMMU should have already set the pending + * bit in the vAPIC backing page. So, we just need to schedule + * in the vcpu. + */ + if (vcpu->mode == OUTSIDE_GUEST_MODE) + kvm_vcpu_wake_up(vcpu); + + return 0; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -986,10 +1072,15 @@ static __init int svm_hardware_setup(void) if (avic) { if (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC) || - !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) + !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) { avic = false; - else + } else { pr_info("AVIC enabled\n"); + + hash_init(svm_vm_data_hash); + spin_lock_init(&svm_vm_data_hash_lock); + amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); + } } return 0; @@ -1280,18 +1371,54 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static inline int avic_get_next_vm_id(void) +{ + int id; + + spin_lock(&avic_vm_id_lock); + + /* AVIC VM ID is one-based. 
*/ + id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); + if (id <= AVIC_VM_ID_MASK) + __set_bit(id, avic_vm_id_bitmap); + else + id = -EAGAIN; + + spin_unlock(&avic_vm_id_lock); + return id; +} + +static inline int avic_free_vm_id(int id) +{ + if (id <= 0 || id > AVIC_VM_ID_MASK) + return -EINVAL; + + spin_lock(&avic_vm_id_lock); + __clear_bit(id, avic_vm_id_bitmap); + spin_unlock(&avic_vm_id_lock); + return 0; +} + static void avic_vm_destroy(struct kvm *kvm) { + unsigned long flags; struct kvm_arch *vm_data = &kvm->arch; + avic_free_vm_id(vm_data->avic_vm_id); + if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) __free_page(vm_data->avic_physical_id_table_page); + + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_del(&vm_data->hnode); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } static int avic_vm_init(struct kvm *kvm) { + unsigned long flags; int err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; @@ -1300,6 +1427,10 @@ static int avic_vm_init(struct kvm *kvm) if (!avic) return 0; + vm_data->avic_vm_id = avic_get_next_vm_id(); + if (vm_data->avic_vm_id < 0) + return vm_data->avic_vm_id; + /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) @@ -1316,6 +1447,10 @@ static int avic_vm_init(struct kvm *kvm) vm_data->avic_logical_id_table_page = l_page; clear_page(page_address(l_page)); + spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); + spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); + return 0; free_avic: @@ -1323,31 +1458,34 @@ free_avic: return err; } -/** - * This function is called during VCPU halt/unhalt. - */ -static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +static inline int +avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) { - u64 entry; - int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - svm->avic_is_running = is_run; + if (!kvm_arch_has_assigned_device(vcpu->kvm)) + return 0; - /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT)) - return; + /* + * Here, we go through the per-vcpu ir_list to update all existing + * interrupt remapping table entries targeting this vcpu.
+ */ + spin_lock_irqsave(&svm->ir_list_lock, flags); - entry = READ_ONCE(*(svm->avic_physical_id_cache)); - WARN_ON(is_run == !!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)); + if (list_empty(&svm->ir_list)) + goto out; - entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - if (is_run) - entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; - WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + list_for_each_entry(ir, &svm->ir_list, node) { + ret = amd_iommu_update_ga(cpu, r, ir->data); + if (ret) + break; + } +out: + spin_unlock_irqrestore(&svm->ir_list_lock, flags); + return ret; } static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) @@ -1374,6 +1512,8 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); + avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, + svm->avic_is_running); } static void avic_vcpu_put(struct kvm_vcpu *vcpu) @@ -1385,10 +1525,27 @@ static void avic_vcpu_put(struct kvm_vcpu *vcpu) return; entry = READ_ONCE(*(svm->avic_physical_id_cache)); + if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) + avic_update_iommu_vcpu_affinity(vcpu, -1, 0); + entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; WRITE_ONCE(*(svm->avic_physical_id_cache), entry); } +/** + * This function is called during VCPU halt/unhalt. + */ +static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->avic_is_running = is_run; + if (is_run) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); +} + static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1450,6 +1607,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) err = avic_init_backing_page(&svm->vcpu); if (err) goto free_page4; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); } /* We initialize this flag to true to make sure that the is_running @@ -4246,6 +4406,209 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) kvm_vcpu_wake_up(vcpu); } +static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + unsigned long flags; + struct amd_svm_iommu_ir *cur; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_for_each_entry(cur, &svm->ir_list, node) { + if (cur->data != pi->ir_data) + continue; + list_del(&cur->node); + kfree(cur); + break; + } + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +} + +static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) +{ + int ret = 0; + unsigned long flags; + struct amd_svm_iommu_ir *ir; + + /** + * In some cases, the existing irte is updated and re-set, + * so we need to check here if it's already been added + * to the ir_list. + */ + if (pi->ir_data && (pi->prev_ga_tag != 0)) { + struct kvm *kvm = svm->vcpu.kvm; + u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); + struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + struct vcpu_svm *prev_svm; + + if (!prev_vcpu) { + ret = -EINVAL; + goto out; + } + + prev_svm = to_svm(prev_vcpu); + svm_ir_list_del(prev_svm, pi); + } + + /** + * Allocating a new amd_svm_iommu_ir, which will get + * added to the per-vcpu ir_list.
*/ + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL); + if (!ir) { + ret = -ENOMEM; + goto out; + } + ir->data = pi->ir_data; + + spin_lock_irqsave(&svm->ir_list_lock, flags); + list_add(&ir->node, &svm->ir_list); + spin_unlock_irqrestore(&svm->ir_list_lock, flags); +out: + return ret; +} + +/** + * Note: + * The HW cannot support posting multicast/broadcast + * interrupts to a vCPU. So, we still use legacy interrupt + * remapping for this kind of interrupt. + * + * For lowest-priority interrupts, we only support + * those with a single CPU as the destination, e.g. the user + * configures the interrupts via /proc/irq or uses + * irqbalance to make the interrupts single-CPU. + */ +static int +get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, + struct vcpu_data *vcpu_info, struct vcpu_svm **svm) +{ + struct kvm_lapic_irq irq; + struct kvm_vcpu *vcpu = NULL; + + kvm_set_msi_irq(kvm, e, &irq); + + if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) { + pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", + __func__, irq.vector); + return -1; + } + + pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, + irq.vector); + *svm = to_svm(vcpu); + vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); + vcpu_info->vector = irq.vector; + + return 0; +} + +/* + * svm_update_pi_irte - set IRTE for Posted-Interrupts + * + * @kvm: kvm + * @host_irq: host irq of the interrupt + * @guest_irq: gsi of the interrupt + * @set: set or unset PI + * returns 0 on success, < 0 on failure + */ +static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, + uint32_t guest_irq, bool set) +{ + struct kvm_kernel_irq_routing_entry *e; + struct kvm_irq_routing_table *irq_rt; + int idx, ret = -EINVAL; + + if (!kvm_arch_has_assigned_device(kvm) || + !irq_remapping_cap(IRQ_POSTING_CAP)) + return 0; + + pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", + __func__, host_irq, guest_irq, set); + + idx = srcu_read_lock(&kvm->irq_srcu); + irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); + WARN_ON(guest_irq >= irq_rt->nr_rt_entries); + + hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { + struct vcpu_data vcpu_info; + struct vcpu_svm *svm = NULL; + + if (e->type != KVM_IRQ_ROUTING_MSI) + continue; + + /** + * Here, we set up with legacy mode in the following cases: + * 1. When the interrupt cannot be targeted to a specific vcpu. + * 2. When unsetting the posted interrupt. + * 3. When APIC virtualization is disabled for the vcpu. + */ + if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && + kvm_vcpu_apicv_active(&svm->vcpu)) { + struct amd_iommu_pi_data pi; + + /* Try to enable guest_mode in IRTE */ + pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; + pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, + svm->vcpu.vcpu_id); + pi.is_guest_mode = true; + pi.vcpu_data = &vcpu_info; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Here, we have successfully set up vcpu affinity in + * IOMMU guest mode. Now, we need to store the posted + * interrupt information in a per-vcpu ir_list so that + * we can reference it directly when we update vcpu + * scheduling information in the IOMMU irte. + */ + if (!ret && pi.is_guest_mode) + svm_ir_list_add(svm, &pi); + } else { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell the IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data.
+ */ + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } + + if (!ret && svm) { + trace_kvm_pi_irte_update(svm->vcpu.vcpu_id, + host_irq, e->gsi, + vcpu_info.vector, + vcpu_info.pi_desc_addr, set); + } + + if (ret < 0) { + pr_err("%s: failed to update PI IRTE\n", __func__); + goto out; + } + } + + ret = 0; +out: + srcu_read_unlock(&kvm->irq_srcu, idx); + return ret; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5078,6 +5441,7 @@ static struct kvm_x86_ops svm_x86_ops = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, + .update_pi_irte = svm_update_pi_irte, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d8580f91e..f9939d0722b4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -422,6 +422,7 @@ struct nested_vmx { struct list_head vmcs02_pool; int vmcs02_num; u64 vmcs01_tsc_offset; + bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; /* @@ -435,6 +436,8 @@ struct nested_vmx { bool pi_pending; u16 posted_intr_nv; + unsigned long *msr_bitmap; + struct hrtimer preemption_timer; bool preemption_timer_expired; @@ -924,7 +927,6 @@ static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; static unsigned long *vmx_msr_bitmap_legacy_x2apic; static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_nested; static unsigned long *vmx_vmread_bitmap; static unsigned long *vmx_vmwrite_bitmap; @@ -937,6 +939,7 @@ static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; int order; + u32 basic_cap; u32 revision_id; u32 pin_based_exec_ctrl; u32 cpu_based_exec_ctrl; @@ -1213,6 +1216,11 @@ static inline bool cpu_has_vmx_ple(void) SECONDARY_EXEC_PAUSE_LOOP_EXITING; } +static inline bool cpu_has_vmx_basic_inout(void) +{ + return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT); +} + static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu) { return flexpriority_enabled && lapic_in_kernel(vcpu); @@ -2198,6 +2206,12 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) new.control) != old.control); } +static void decache_tsc_multiplier(struct vcpu_vmx *vmx) +{ + vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; + vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); +} + /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes * vcpu mutex is already taken. 
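An aside on the decache_tsc_multiplier() helper introduced just above: vmx caches the last TSC multiplier written (current_tsc_ratio) so that vmx_vcpu_load() can skip a redundant vmcs_write64(), but vmcs01 and vmcs02 are distinct VMCSs, so after a nested transition the hardware field must be rewritten even when the cached ratio is unchanged; that is why later hunks call the helper unconditionally from prepare_vmcs02() and nested_vmx_vmexit(). A rough stand-alone sketch of the pattern, with illustrative stand-in types rather than the kernel's:

#include <stdint.h>

struct vmcs_sketch { uint64_t tsc_multiplier; };

struct vmx_sketch {
	uint64_t current_tsc_ratio;	/* software cache */
	struct vmcs_sketch *loaded;	/* vmcs01 or vmcs02 */
};

/* Mirrors decache_tsc_multiplier(): refresh the cache and the field
 * of whichever VMCS is currently loaded (real code: vmcs_write64()). */
static void decache_tsc_multiplier_sketch(struct vmx_sketch *vmx, uint64_t ratio)
{
	vmx->current_tsc_ratio = ratio;
	vmx->loaded->tsc_multiplier = ratio;
}

/* vcpu_load fast path: skipping the write is safe only while the same
 * VMCS stays loaded. After a vmcs01<->vmcs02 switch the newly loaded
 * VMCS may hold a stale multiplier even though the cache matches. */
static void vcpu_load_sketch(struct vmx_sketch *vmx, uint64_t ratio)
{
	if (vmx->current_tsc_ratio != ratio)
		decache_tsc_multiplier_sketch(vmx, ratio);
}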
@@ -2256,10 +2270,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) /* Setup TSC multiplier */ if (kvm_has_tsc_control && - vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) { - vmx->current_tsc_ratio = vcpu->arch.tsc_scaling_ratio; - vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); - } + vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio) + decache_tsc_multiplier(vmx); vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); @@ -2508,7 +2520,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) unsigned long *msr_bitmap; if (is_guest_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_nested; + msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap; else if (cpu_has_secondary_exec_ctrls() && (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) & SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { @@ -2871,6 +2883,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + if (cpu_has_vmx_basic_inout()) + *pdata |= VMX_BASIC_INOUT; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: @@ -3451,7 +3465,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) return -EIO; vmcs_conf->size = vmx_msr_high & 0x1fff; - vmcs_conf->order = get_order(vmcs_config.size); + vmcs_conf->order = get_order(vmcs_conf->size); + vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff; vmcs_conf->revision_id = vmx_msr_low; vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; @@ -6103,7 +6118,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { + if (gla_validity == 0x2) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), @@ -6363,13 +6378,6 @@ static __init int hardware_setup(void) if (!vmx_msr_bitmap_longmode_x2apic) goto out4; - if (nested) { - vmx_msr_bitmap_nested = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_nested) - goto out5; - } - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_vmread_bitmap) goto out6; @@ -6392,8 +6400,6 @@ static __init int hardware_setup(void) memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); - if (nested) - memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; @@ -6529,9 +6535,6 @@ out8: out7: free_page((unsigned long)vmx_vmread_bitmap); out6: - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); -out5: free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); out4: free_page((unsigned long)vmx_msr_bitmap_longmode); @@ -6557,8 +6560,6 @@ static __exit void hardware_unsetup(void) free_page((unsigned long)vmx_io_bitmap_a); free_page((unsigned long)vmx_vmwrite_bitmap); free_page((unsigned long)vmx_vmread_bitmap); - if (nested) - free_page((unsigned long)vmx_msr_bitmap_nested); free_kvm_area(); } @@ -6734,7 +6735,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. 
*/ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); - pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); + pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator); } static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) @@ -6995,16 +6996,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu) return 1; } + if (cpu_has_vmx_msr_bitmap()) { + vmx->nested.msr_bitmap = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx->nested.msr_bitmap) + goto out_msr_bitmap; + } + vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); if (!vmx->nested.cached_vmcs12) - return -ENOMEM; + goto out_cached_vmcs12; if (enable_shadow_vmcs) { shadow_vmcs = alloc_vmcs(); - if (!shadow_vmcs) { - kfree(vmx->nested.cached_vmcs12); - return -ENOMEM; - } + if (!shadow_vmcs) + goto out_shadow_vmcs; /* mark vmcs as shadow */ shadow_vmcs->revision_id |= (1u << 31); /* init shadow vmcs */ @@ -7016,7 +7022,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_num = 0; hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; vmx->nested.vmxon = true; @@ -7024,6 +7030,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); return 1; + +out_shadow_vmcs: + kfree(vmx->nested.cached_vmcs12); + +out_cached_vmcs12: + free_page((unsigned long)vmx->nested.msr_bitmap); + +out_msr_bitmap: + return -ENOMEM; } /* @@ -7098,6 +7113,10 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.vmxon = false; free_vpid(vmx->nested.vpid02); nested_release_vmcs12(vmx); + if (vmx->nested.msr_bitmap) { + free_page((unsigned long)vmx->nested.msr_bitmap); + vmx->nested.msr_bitmap = NULL; + } if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); kfree(vmx->nested.cached_vmcs12); @@ -8419,6 +8438,12 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) { u32 sec_exec_control; + /* Postpone execution until vmcs01 is the current VMCS. */ + if (is_guest_mode(vcpu)) { + to_vmx(vcpu)->nested.change_vmcs01_virtual_x2apic_mode = true; + return; + } + /* * There is not point to enable virtualize x2apic without enable * apicv @@ -9472,8 +9497,10 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, { int msr; struct page *page; - unsigned long *msr_bitmap; + unsigned long *msr_bitmap_l1; + unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* This shortcut is ok because we support only x2APIC MSRs so far. 
*/ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -9482,63 +9509,37 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, WARN_ON(1); return false; } - msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (!msr_bitmap_l1) { nested_release_page_clean(page); WARN_ON(1); return false; } + memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) for (msr = 0x800; msr <= 0x8ff; msr++) nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, msr, MSR_TYPE_R); - /* TPR is allowed */ - nested_vmx_disable_intercept_for_msr(msr_bitmap, - vmx_msr_bitmap_nested, + + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_TASKPRI >> 4), MSR_TYPE_R | MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { - /* EOI and self-IPI are allowed */ nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_EOI >> 4), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( - msr_bitmap, - vmx_msr_bitmap_nested, + msr_bitmap_l1, msr_bitmap_l0, APIC_BASE_MSR + (APIC_SELF_IPI >> 4), MSR_TYPE_W); } - } else { - /* - * Enable reading intercept of all the x2apic - * MSRs. We should not rely on vmcs12 to do any - * optimizations here, it may have been modified - * by L1. - */ - for (msr = 0x800; msr <= 0x8ff; msr++) - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - msr, - MSR_TYPE_R); - - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - __vmx_enable_intercept_for_msr( - vmx_msr_bitmap_nested, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); } kunmap(page); nested_release_page_clean(page); @@ -9606,7 +9607,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, maxphyaddr = cpuid_maxphyaddr(vcpu); if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { - pr_warn_ratelimited( + pr_debug_ratelimited( "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", addr_field, maxphyaddr, count, addr); return -EINVAL; @@ -9679,13 +9680,13 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) for (i = 0; i < count; i++) { if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, sizeof(e))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); goto fail; } if (nested_vmx_load_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); goto fail; @@ -9693,7 +9694,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.index = e.index; msr.data = e.value; if (kvm_set_msr(vcpu, &msr)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, e.value); goto fail; @@ -9714,13 +9715,13 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), &e, 2 * sizeof(u32))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); return -EINVAL; } if 
(nested_vmx_store_msr_check(vcpu, &e)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s check failed (%u, 0x%x, 0x%x)\n", __func__, i, e.index, e.reserved); return -EINVAL; @@ -9728,7 +9729,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr_info.host_initiated = false; msr_info.index = e.index; if (kvm_get_msr(vcpu, &msr_info)) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; @@ -9737,7 +9738,7 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), &msr_info.data, sizeof(msr_info.data))) { - pr_warn_ratelimited( + pr_debug_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", __func__, i, e.index, msr_info.data); return -EINVAL; @@ -9957,10 +9958,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) } if (cpu_has_vmx_msr_bitmap() && - exec_control & CPU_BASED_USE_MSR_BITMAPS) { - nested_vmx_merge_msr_bitmap(vcpu, vmcs12); - /* MSR_BITMAP will be set by following vmx_set_efer. */ - } else + exec_control & CPU_BASED_USE_MSR_BITMAPS && + nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) + ; /* MSR_BITMAP will be set by following vmx_set_efer. */ + else exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; /* @@ -10011,6 +10012,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); else vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); if (enable_vpid) { /* @@ -10506,6 +10509,9 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); } + if (nested_cpu_has_ept(vmcs12)) + vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); + if (nested_cpu_has_vid(vmcs12)) vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); @@ -10767,6 +10773,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, else vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) + decache_tsc_multiplier(vmx); + + if (vmx->nested.change_vmcs01_virtual_x2apic_mode) { + vmx->nested.change_vmcs01_virtual_x2apic_mode = false; + vmx_set_virtual_x2apic_mode(vcpu, + vcpu->arch.apic_base & X2APIC_ENABLE); + } /* This is needed for same reason as it was needed in prepare_vmcs02 */ vmx->host_rsp = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19f9f9e05c2a..57ffe7893104 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6700,7 +6700,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); - /* Interrupt is enabled by handle_external_intr() */ kvm_x86_ops->handle_external_intr(vcpu); ++vcpu->stat.exits; diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 96de97a46079..a7aa0e76eebd 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -137,6 +137,7 @@ struct iommu_dev_data { bool pri_tlp; /* PASID TLB required for PPR completions */ u32 errata; /* Bitmap for errata to apply */ + bool use_vapic; /* Enable device to use vapic mode */ }; /* @@ -707,14 +708,74 @@ static void iommu_poll_ppr_log(struct amd_iommu *iommu) } } +#ifdef CONFIG_IRQ_REMAP +static int (*iommu_ga_log_notifier)(u32); + +int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) +{ + iommu_ga_log_notifier = notifier; + + return 0; +} +EXPORT_SYMBOL(amd_iommu_register_ga_log_notifier); + +static 
void iommu_poll_ga_log(struct amd_iommu *iommu) +{ + u32 head, tail, cnt = 0; + + if (iommu->ga_log == NULL) + return; + + head = readl(iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_GA_TAIL_OFFSET); + + while (head != tail) { + volatile u64 *raw; + u64 log_entry; + + raw = (u64 *)(iommu->ga_log + head); + cnt++; + + /* Avoid memcpy function-call overhead */ + log_entry = *raw; + + /* Update head pointer of hardware ring-buffer */ + head = (head + GA_ENTRY_SIZE) % GA_LOG_SIZE; + writel(head, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + + /* Handle GA entry */ + switch (GA_REQ_TYPE(log_entry)) { + case GA_GUEST_NR: + if (!iommu_ga_log_notifier) + break; + + pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n", + __func__, GA_DEVID(log_entry), + GA_TAG(log_entry)); + + if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0) + pr_err("AMD-Vi: GA log notifier failed.\n"); + break; + default: + break; + } + } +} +#endif /* CONFIG_IRQ_REMAP */ + +#define AMD_IOMMU_INT_MASK \ + (MMIO_STATUS_EVT_INT_MASK | \ + MMIO_STATUS_PPR_INT_MASK | \ + MMIO_STATUS_GALOG_INT_MASK) + irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu = (struct amd_iommu *) data; u32 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - while (status & (MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK)) { - /* Enable EVT and PPR interrupts again */ - writel((MMIO_STATUS_EVT_INT_MASK | MMIO_STATUS_PPR_INT_MASK), + while (status & AMD_IOMMU_INT_MASK) { + /* Enable EVT and PPR and GA interrupts again */ + writel(AMD_IOMMU_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); if (status & MMIO_STATUS_EVT_INT_MASK) { @@ -727,6 +788,13 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data) iommu_poll_ppr_log(iommu); } +#ifdef CONFIG_IRQ_REMAP + if (status & MMIO_STATUS_GALOG_INT_MASK) { + pr_devel("AMD-Vi: Processing IOMMU GA Log\n"); + iommu_poll_ga_log(iommu); + } +#endif + /* * Hardware bug: ERBT1312 * When re-enabling interrupt (by writing 1 @@ -2948,6 +3016,12 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + (dom->type == IOMMU_DOMAIN_UNMANAGED)) + dev_data->use_vapic = 0; +#endif + iommu_completion_wait(iommu); } @@ -2973,6 +3047,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, ret = attach_device(dev, domain); +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { + if (dom->type == IOMMU_DOMAIN_UNMANAGED) + dev_data->use_vapic = 1; + else + dev_data->use_vapic = 0; + } +#endif + iommu_completion_wait(iommu); return ret; @@ -3511,34 +3594,6 @@ EXPORT_SYMBOL(amd_iommu_device_info); * *****************************************************************************/ -union irte { - u32 val; - struct { - u32 valid : 1, - no_fault : 1, - int_type : 3, - rq_eoi : 1, - dm : 1, - rsvd_1 : 1, - destination : 8, - vector : 8, - rsvd_2 : 8; - } fields; -}; - -struct irq_2_irte { - u16 devid; /* Device ID for IRTE table */ - u16 index; /* Index into IRTE table*/ -}; - -struct amd_ir_data { - struct irq_2_irte irq_2_irte; - union irte irte_entry; - union { - struct msi_msg msi_entry; - }; -}; - static struct irq_chip amd_ir_chip; #define DTE_IRQ_PHYS_ADDR_MASK (((1ULL << 45)-1) << 6) @@ -3560,8 +3615,6 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) amd_iommu_dev_table[devid].data[2] = dte; } -#define IRTE_ALLOCATED (~1U) - static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) { struct 
irq_remap_table *table = NULL; @@ -3607,13 +3660,18 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) goto out; } - memset(table->table, 0, MAX_IRQS_PER_TABLE * sizeof(u32)); + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + memset(table->table, 0, + MAX_IRQS_PER_TABLE * sizeof(u32)); + else + memset(table->table, 0, + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); if (ioapic) { int i; for (i = 0; i < 32; ++i) - table->table[i] = IRTE_ALLOCATED; + iommu->irte_ops->set_allocated(table, i); } irq_lookup_table[devid] = table; @@ -3639,6 +3697,10 @@ static int alloc_irq_index(u16 devid, int count) struct irq_remap_table *table; unsigned long flags; int index, c; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!iommu) + return -ENODEV; table = get_irq_table(devid, false); if (!table) @@ -3650,14 +3712,14 @@ static int alloc_irq_index(u16 devid, int count) for (c = 0, index = table->min_index; index < MAX_IRQS_PER_TABLE; ++index) { - if (table->table[index] == 0) + if (!iommu->irte_ops->is_allocated(table, index)) c += 1; else c = 0; if (c == count) { for (; c != 0; --c) - table->table[index - c + 1] = IRTE_ALLOCATED; + iommu->irte_ops->set_allocated(table, index - c + 1); index -= count - 1; goto out; @@ -3672,7 +3734,42 @@ out: return index; } -static int modify_irte(u16 devid, int index, union irte irte) +static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, + struct amd_ir_data *data) +{ + struct irq_remap_table *table; + struct amd_iommu *iommu; + unsigned long flags; + struct irte_ga *entry; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + return -EINVAL; + + table = get_irq_table(devid, false); + if (!table) + return -ENOMEM; + + spin_lock_irqsave(&table->lock, flags); + + entry = (struct irte_ga *)table->table; + entry = &entry[index]; + entry->lo.fields_remap.valid = 0; + entry->hi.val = irte->hi.val; + entry->lo.val = irte->lo.val; + entry->lo.fields_remap.valid = 1; + if (data) + data->ref = entry; + + spin_unlock_irqrestore(&table->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); + + return 0; +} + +static int modify_irte(u16 devid, int index, union irte *irte) { struct irq_remap_table *table; struct amd_iommu *iommu; @@ -3687,7 +3784,7 @@ static int modify_irte(u16 devid, int index, union irte irte) return -ENOMEM; spin_lock_irqsave(&table->lock, flags); - table->table[index] = irte.val; + table->table[index] = irte->val; spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); @@ -3711,13 +3808,146 @@ static void free_irte(u16 devid, int index) return; spin_lock_irqsave(&table->lock, flags); - table->table[index] = 0; + iommu->irte_ops->clear_allocated(table, index); spin_unlock_irqrestore(&table->lock, flags); iommu_flush_irt(iommu, devid); iommu_completion_wait(iommu); } +static void irte_prepare(void *entry, + u32 delivery_mode, u32 dest_mode, + u8 vector, u32 dest_apicid, int devid) +{ + union irte *irte = (union irte *) entry; + + irte->val = 0; + irte->fields.vector = vector; + irte->fields.int_type = delivery_mode; + irte->fields.destination = dest_apicid; + irte->fields.dm = dest_mode; + irte->fields.valid = 1; +} + +static void irte_ga_prepare(void *entry, + u32 delivery_mode, u32 dest_mode, + u8 vector, u32 dest_apicid, int devid) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + struct iommu_dev_data *dev_data = search_dev_data(devid); + + irte->lo.val = 0; + irte->hi.val = 0; + irte->lo.fields_remap.guest_mode = dev_data ? 
dev_data->use_vapic : 0; + irte->lo.fields_remap.int_type = delivery_mode; + irte->lo.fields_remap.dm = dest_mode; + irte->hi.fields.vector = vector; + irte->lo.fields_remap.destination = dest_apicid; + irte->lo.fields_remap.valid = 1; +} + +static void irte_activate(void *entry, u16 devid, u16 index) +{ + union irte *irte = (union irte *) entry; + + irte->fields.valid = 1; + modify_irte(devid, index, irte); +} + +static void irte_ga_activate(void *entry, u16 devid, u16 index) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->lo.fields_remap.valid = 1; + modify_irte_ga(devid, index, irte, NULL); +} + +static void irte_deactivate(void *entry, u16 devid, u16 index) +{ + union irte *irte = (union irte *) entry; + + irte->fields.valid = 0; + modify_irte(devid, index, irte); +} + +static void irte_ga_deactivate(void *entry, u16 devid, u16 index) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + + irte->lo.fields_remap.valid = 0; + modify_irte_ga(devid, index, irte, NULL); +} + +static void irte_set_affinity(void *entry, u16 devid, u16 index, + u8 vector, u32 dest_apicid) +{ + union irte *irte = (union irte *) entry; + + irte->fields.vector = vector; + irte->fields.destination = dest_apicid; + modify_irte(devid, index, irte); +} + +static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, + u8 vector, u32 dest_apicid) +{ + struct irte_ga *irte = (struct irte_ga *) entry; + struct iommu_dev_data *dev_data = search_dev_data(devid); + + if (!dev_data || !dev_data->use_vapic) { + irte->hi.fields.vector = vector; + irte->lo.fields_remap.destination = dest_apicid; + irte->lo.fields_remap.guest_mode = 0; + modify_irte_ga(devid, index, irte, NULL); + } +} + +#define IRTE_ALLOCATED (~1U) +static void irte_set_allocated(struct irq_remap_table *table, int index) +{ + table->table[index] = IRTE_ALLOCATED; +} + +static void irte_ga_set_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + memset(&irte->lo.val, 0, sizeof(u64)); + memset(&irte->hi.val, 0, sizeof(u64)); + irte->hi.fields.vector = 0xff; +} + +static bool irte_is_allocated(struct irq_remap_table *table, int index) +{ + union irte *ptr = (union irte *)table->table; + union irte *irte = &ptr[index]; + + return irte->val != 0; +} + +static bool irte_ga_is_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + return irte->hi.fields.vector != 0; +} + +static void irte_clear_allocated(struct irq_remap_table *table, int index) +{ + table->table[index] = 0; +} + +static void irte_ga_clear_allocated(struct irq_remap_table *table, int index) +{ + struct irte_ga *ptr = (struct irte_ga *)table->table; + struct irte_ga *irte = &ptr[index]; + + memset(&irte->lo.val, 0, sizeof(u64)); + memset(&irte->hi.val, 0, sizeof(u64)); +} + static int get_devid(struct irq_alloc_info *info) { int devid = -1; @@ -3802,19 +4032,17 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, { struct irq_2_irte *irte_info = &data->irq_2_irte; struct msi_msg *msg = &data->msi_entry; - union irte *irte = &data->irte_entry; struct IO_APIC_route_entry *entry; + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!iommu) + return; data->irq_2_irte.devid = devid; data->irq_2_irte.index = index + sub_handle; - - /* Setup IRTE for IOMMU */ - irte->val = 0; - irte->fields.vector = irq_cfg->vector; - irte->fields.int_type = 
apic->irq_delivery_mode; - irte->fields.destination = irq_cfg->dest_apicid; - irte->fields.dm = apic->irq_dest_mode; - irte->fields.valid = 1; + iommu->irte_ops->prepare(data->entry, apic->irq_delivery_mode, + apic->irq_dest_mode, irq_cfg->vector, + irq_cfg->dest_apicid, devid); switch (info->type) { case X86_IRQ_ALLOC_TYPE_IOAPIC: @@ -3845,12 +4073,32 @@ static void irq_remapping_prepare_irte(struct amd_ir_data *data, } } +struct amd_irte_ops irte_32_ops = { + .prepare = irte_prepare, + .activate = irte_activate, + .deactivate = irte_deactivate, + .set_affinity = irte_set_affinity, + .set_allocated = irte_set_allocated, + .is_allocated = irte_is_allocated, + .clear_allocated = irte_clear_allocated, +}; + +struct amd_irte_ops irte_128_ops = { + .prepare = irte_ga_prepare, + .activate = irte_ga_activate, + .deactivate = irte_ga_deactivate, + .set_affinity = irte_ga_set_affinity, + .set_allocated = irte_ga_set_allocated, + .is_allocated = irte_ga_is_allocated, + .clear_allocated = irte_ga_clear_allocated, +}; + static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { struct irq_alloc_info *info = arg; struct irq_data *irq_data; - struct amd_ir_data *data; + struct amd_ir_data *data = NULL; struct irq_cfg *cfg; int i, ret, devid; int index = -1; @@ -3902,6 +4150,16 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, if (!data) goto out_free_data; + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + data->entry = kzalloc(sizeof(union irte), GFP_KERNEL); + else + data->entry = kzalloc(sizeof(struct irte_ga), + GFP_KERNEL); + if (!data->entry) { + kfree(data); + goto out_free_data; + } + irq_data->hwirq = (devid << 16) + i; irq_data->chip_data = data; irq_data->chip = &amd_ir_chip; @@ -3938,6 +4196,7 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq, data = irq_data->chip_data; irte_info = &data->irq_2_irte; free_irte(irte_info->devid, irte_info->index); + kfree(data->entry); kfree(data); } } @@ -3949,8 +4208,11 @@ static void irq_remapping_activate(struct irq_domain *domain, { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; - modify_irte(irte_info->devid, irte_info->index, data->irte_entry); + if (iommu) + iommu->irte_ops->activate(data->entry, irte_info->devid, + irte_info->index); } static void irq_remapping_deactivate(struct irq_domain *domain, @@ -3958,10 +4220,11 @@ static void irq_remapping_deactivate(struct irq_domain *domain, { struct amd_ir_data *data = irq_data->chip_data; struct irq_2_irte *irte_info = &data->irq_2_irte; - union irte entry; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; - entry.val = 0; - modify_irte(irte_info->devid, irte_info->index, data->irte_entry); + if (iommu) + iommu->irte_ops->deactivate(data->entry, irte_info->devid, + irte_info->index); } static struct irq_domain_ops amd_ir_domain_ops = { @@ -3971,6 +4234,70 @@ static struct irq_domain_ops amd_ir_domain_ops = { .deactivate = irq_remapping_deactivate, }; +static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) +{ + struct amd_iommu *iommu; + struct amd_iommu_pi_data *pi_data = vcpu_info; + struct vcpu_data *vcpu_pi_info = pi_data->vcpu_data; + struct amd_ir_data *ir_data = data->chip_data; + struct irte_ga *irte = (struct irte_ga *) ir_data->entry; + struct irq_2_irte *irte_info = &ir_data->irq_2_irte; + struct iommu_dev_data *dev_data 
= search_dev_data(irte_info->devid); + + /* Note: + * This device has never been set up for guest mode. + * we should not modify the IRTE + */ + if (!dev_data || !dev_data->use_vapic) + return 0; + + pi_data->ir_data = ir_data; + + /* Note: + * SVM tries to set up for VAPIC mode, but we are in + * legacy mode. So, we force legacy mode instead. + */ + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { + pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n", + __func__); + pi_data->is_guest_mode = false; + } + + iommu = amd_iommu_rlookup_table[irte_info->devid]; + if (iommu == NULL) + return -EINVAL; + + pi_data->prev_ga_tag = ir_data->cached_ga_tag; + if (pi_data->is_guest_mode) { + /* Setting */ + irte->hi.fields.ga_root_ptr = (pi_data->base >> 12); + irte->hi.fields.vector = vcpu_pi_info->vector; + irte->lo.fields_vapic.guest_mode = 1; + irte->lo.fields_vapic.ga_tag = pi_data->ga_tag; + + ir_data->cached_ga_tag = pi_data->ga_tag; + } else { + /* Un-Setting */ + struct irq_cfg *cfg = irqd_cfg(data); + + irte->hi.val = 0; + irte->lo.val = 0; + irte->hi.fields.vector = cfg->vector; + irte->lo.fields_remap.guest_mode = 0; + irte->lo.fields_remap.destination = cfg->dest_apicid; + irte->lo.fields_remap.int_type = apic->irq_delivery_mode; + irte->lo.fields_remap.dm = apic->irq_dest_mode; + + /* + * This communicates the ga_tag back to the caller + * so that it can do all the necessary clean up. + */ + ir_data->cached_ga_tag = 0; + } + + return modify_irte_ga(irte_info->devid, irte_info->index, irte, ir_data); +} + static int amd_ir_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3978,8 +4305,12 @@ static int amd_ir_set_affinity(struct irq_data *data, struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct irq_cfg *cfg = irqd_cfg(data); struct irq_data *parent = data->parent_data; + struct amd_iommu *iommu = amd_iommu_rlookup_table[irte_info->devid]; int ret; + if (!iommu) + return -ENODEV; + ret = parent->chip->irq_set_affinity(parent, mask, force); if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE) return ret; @@ -3988,9 +4319,8 @@ static int amd_ir_set_affinity(struct irq_data *data, * Atomically updates the IRTE with the new destination, vector * and flushes the interrupt entry cache. 
*/ - ir_data->irte_entry.fields.vector = cfg->vector; - ir_data->irte_entry.fields.destination = cfg->dest_apicid; - modify_irte(irte_info->devid, irte_info->index, ir_data->irte_entry); + iommu->irte_ops->set_affinity(ir_data->entry, irte_info->devid, + irte_info->index, cfg->vector, cfg->dest_apicid); /* * After this point, all the interrupts will start arriving @@ -4012,6 +4342,7 @@ static void ir_compose_msi_msg(struct irq_data *irq_data, struct msi_msg *msg) static struct irq_chip amd_ir_chip = { .irq_ack = ir_ack_apic_edge, .irq_set_affinity = amd_ir_set_affinity, + .irq_set_vcpu_affinity = amd_ir_set_vcpu_affinity, .irq_compose_msi_msg = ir_compose_msi_msg, }; @@ -4026,4 +4357,43 @@ int amd_iommu_create_irq_domain(struct amd_iommu *iommu) return 0; } + +int amd_iommu_update_ga(int cpu, bool is_run, void *data) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct irq_remap_table *irt; + struct amd_ir_data *ir_data = (struct amd_ir_data *)data; + int devid = ir_data->irq_2_irte.devid; + struct irte_ga *entry = (struct irte_ga *) ir_data->entry; + struct irte_ga *ref = (struct irte_ga *) ir_data->ref; + + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || + !ref || !entry || !entry->lo.fields_vapic.guest_mode) + return 0; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -ENODEV; + + irt = get_irq_table(devid, false); + if (!irt) + return -ENODEV; + + spin_lock_irqsave(&irt->lock, flags); + + if (ref->lo.fields_vapic.guest_mode) { + if (cpu >= 0) + ref->lo.fields_vapic.destination = cpu; + ref->lo.fields_vapic.is_run = is_run; + barrier(); + } + + spin_unlock_irqrestore(&irt->lock, flags); + + iommu_flush_irt(iommu, devid); + iommu_completion_wait(iommu); + return 0; +} +EXPORT_SYMBOL(amd_iommu_update_ga); #endif diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 59741ead7e15..cd1713631a4a 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -84,6 +84,7 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +#define LOOP_TIMEOUT 100000 /* * ACPI table definitions * @@ -145,6 +146,8 @@ struct ivmd_header { bool amd_iommu_dump; bool amd_iommu_irq_remap __read_mostly; +int amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; + static bool amd_iommu_detected; static bool __initdata amd_iommu_disabled; static int amd_iommu_target_ivhd_type; @@ -386,6 +389,10 @@ static void iommu_disable(struct amd_iommu *iommu) iommu_feature_disable(iommu, CONTROL_EVT_INT_EN); iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN); + /* Disable IOMMU GA_LOG */ + iommu_feature_disable(iommu, CONTROL_GALOG_EN); + iommu_feature_disable(iommu, CONTROL_GAINT_EN); + /* Disable IOMMU hardware itself */ iommu_feature_disable(iommu, CONTROL_IOMMU_EN); } @@ -671,6 +678,99 @@ static void __init free_ppr_log(struct amd_iommu *iommu) free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); } +static void free_ga_log(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + if (iommu->ga_log) + free_pages((unsigned long)iommu->ga_log, + get_order(GA_LOG_SIZE)); + if (iommu->ga_log_tail) + free_pages((unsigned long)iommu->ga_log_tail, + get_order(8)); +#endif +} + +static int iommu_ga_log_enable(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + u32 status, i; + + if (!iommu->ga_log) + return -EINVAL; + + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + + /* Check if already running */ + if (status & (MMIO_STATUS_GALOG_RUN_MASK)) + return 0; + + iommu_feature_enable(iommu, CONTROL_GAINT_EN); + 
iommu_feature_enable(iommu, CONTROL_GALOG_EN); + + for (i = 0; i < LOOP_TIMEOUT; ++i) { + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + if (status & (MMIO_STATUS_GALOG_RUN_MASK)) + break; + } + + if (i >= LOOP_TIMEOUT) + return -EINVAL; +#endif /* CONFIG_IRQ_REMAP */ + return 0; +} + +#ifdef CONFIG_IRQ_REMAP +static int iommu_init_ga_log(struct amd_iommu *iommu) +{ + u64 entry; + + if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + return 0; + + iommu->ga_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(GA_LOG_SIZE)); + if (!iommu->ga_log) + goto err_out; + + iommu->ga_log_tail = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(8)); + if (!iommu->ga_log_tail) + goto err_out; + + entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, + &entry, sizeof(entry)); + entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; + memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, + &entry, sizeof(entry)); + writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_GA_TAIL_OFFSET); + + return 0; +err_out: + free_ga_log(iommu); + return -EINVAL; +} +#endif /* CONFIG_IRQ_REMAP */ + +static int iommu_init_ga(struct amd_iommu *iommu) +{ + int ret = 0; + +#ifdef CONFIG_IRQ_REMAP + /* Note: We have already checked GASup from IVRS table. + * Now, we need to make sure that GAMSup is set. + */ + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) && + !iommu_feature(iommu, FEATURE_GAM_VAPIC)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA; + + ret = iommu_init_ga_log(iommu); +#endif /* CONFIG_IRQ_REMAP */ + + return ret; +} + static void iommu_enable_gt(struct amd_iommu *iommu) { if (!iommu_feature(iommu, FEATURE_GT)) @@ -1144,6 +1244,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu) free_command_buffer(iommu); free_event_buffer(iommu); free_ppr_log(iommu); + free_ga_log(iommu); iommu_unmap_mmio_space(iommu); } @@ -1258,6 +1359,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_REG_END_OFFSET; else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; + if (((h->efr_attr & (0x1 << IOMMU_FEAT_GASUP_SHIFT)) == 0)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; case 0x11: case 0x40: @@ -1265,6 +1368,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) iommu->mmio_phys_end = MMIO_REG_END_OFFSET; else iommu->mmio_phys_end = MMIO_CNTR_CONF_OFFSET; + if (((h->efr_reg & (0x1 << IOMMU_EFR_GASUP_SHIFT)) == 0)) + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; break; default: return -EINVAL; @@ -1432,6 +1537,7 @@ static int iommu_init_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; u32 range, misc, low, high; + int ret; iommu->dev = pci_get_bus_and_slot(PCI_BUS_NUM(iommu->devid), iommu->devid & 0xff); @@ -1488,6 +1594,10 @@ static int iommu_init_pci(struct amd_iommu *iommu) if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu)) return -ENOMEM; + ret = iommu_init_ga(iommu); + if (ret) + return ret; + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) amd_iommu_np_cache = true; @@ -1545,16 +1655,24 @@ static void print_iommu_info(void) dev_name(&iommu->dev->dev), iommu->cap_ptr); if (iommu->cap & (1 << IOMMU_CAP_EFR)) { - pr_info("AMD-Vi: Extended features: "); + pr_info("AMD-Vi: Extended features (%#llx):\n", + iommu->features); for (i = 0; i < ARRAY_SIZE(feat_str); ++i) { if (iommu_feature(iommu, (1ULL << i))) pr_cont(" %s", feat_str[i]); 
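			/* Each feat_str[] entry corresponds to one EFR bit
			 * position; every set bit prints its feature name. */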
} + + if (iommu->features & FEATURE_GAM_VAPIC) + pr_cont(" GA_vAPIC"); + pr_cont("\n"); } } - if (irq_remapping_enabled) + if (irq_remapping_enabled) { pr_info("AMD-Vi: Interrupt remapping enabled\n"); + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + pr_info("AMD-Vi: virtual APIC enabled\n"); + } } static int __init amd_iommu_init_pci(void) @@ -1645,6 +1763,8 @@ enable_faults: if (iommu->ppr_log != NULL) iommu_feature_enable(iommu, CONTROL_PPFINT_EN); + iommu_ga_log_enable(iommu); + return 0; } @@ -1862,6 +1982,24 @@ static void iommu_apply_resume_quirks(struct amd_iommu *iommu) iommu->stored_addr_lo | 1); } +static void iommu_enable_ga(struct amd_iommu *iommu) +{ +#ifdef CONFIG_IRQ_REMAP + switch (amd_iommu_guest_ir) { + case AMD_IOMMU_GUEST_IR_VAPIC: + iommu_feature_enable(iommu, CONTROL_GAM_EN); + /* Fall through */ + case AMD_IOMMU_GUEST_IR_LEGACY_GA: + iommu_feature_enable(iommu, CONTROL_GA_EN); + iommu->irte_ops = &irte_128_ops; + break; + default: + iommu->irte_ops = &irte_32_ops; + break; + } +#endif +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1877,9 +2015,15 @@ static void early_enable_iommus(void) iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); iommu_set_exclusion_range(iommu); + iommu_enable_ga(iommu); iommu_enable(iommu); iommu_flush_all_caches(iommu); } + +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP); +#endif } static void enable_iommus_v2(void) @@ -1905,6 +2049,11 @@ static void disable_iommus(void) for_each_iommu(iommu) iommu_disable(iommu); + +#ifdef CONFIG_IRQ_REMAP + if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) + amd_iommu_irq_ops.capability &= ~(1 << IRQ_POSTING_CAP); +#endif } /* @@ -2059,7 +2208,7 @@ static int __init early_amd_iommu_init(void) struct acpi_table_header *ivrs_base; acpi_size ivrs_size; acpi_status status; - int i, ret = 0; + int i, remap_cache_sz, ret = 0; if (!amd_iommu_detected) return -ENODEV; @@ -2157,10 +2306,14 @@ static int __init early_amd_iommu_init(void) * remapping tables. 
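	 * The cache entry size depends on the IRTE format: 4 bytes per
	 * entry for the legacy IRTE, 16 bytes per entry once the GA
	 * (128-bit IRTE) modes are in use, as selected just below.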
*/ ret = -ENOMEM; + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) + remap_cache_sz = MAX_IRQS_PER_TABLE * sizeof(u32); + else + remap_cache_sz = MAX_IRQS_PER_TABLE * (sizeof(u64) * 2); amd_iommu_irq_cache = kmem_cache_create("irq_remap_cache", - MAX_IRQS_PER_TABLE * sizeof(u32), - IRQ_TABLE_ALIGNMENT, - 0, NULL); + remap_cache_sz, + IRQ_TABLE_ALIGNMENT, + 0, NULL); if (!amd_iommu_irq_cache) goto out; @@ -2413,6 +2566,21 @@ static int __init parse_amd_iommu_dump(char *str) return 1; } +static int __init parse_amd_iommu_intr(char *str) +{ + for (; *str; ++str) { + if (strncmp(str, "legacy", 6) == 0) { + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY; + break; + } + if (strncmp(str, "vapic", 5) == 0) { + amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_VAPIC; + break; + } + } + return 1; +} + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { @@ -2521,6 +2689,7 @@ static int __init parse_ivrs_acpihid(char *str) __setup("amd_iommu_dump", parse_amd_iommu_dump); __setup("amd_iommu=", parse_amd_iommu_options); +__setup("amd_iommu_intr=", parse_amd_iommu_intr); __setup("ivrs_ioapic", parse_ivrs_ioapic); __setup("ivrs_hpet", parse_ivrs_hpet); __setup("ivrs_acpihid", parse_ivrs_acpihid); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 0bd9eb374462..faa3b4895cf0 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -38,6 +38,7 @@ extern int amd_iommu_enable(void); extern void amd_iommu_disable(void); extern int amd_iommu_reenable(int); extern int amd_iommu_enable_faulting(void); +extern int amd_iommu_guest_ir; /* IOMMUv2 specific functions */ struct iommu_domain; diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index caf5e3822715..fa766eefd590 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -22,6 +22,7 @@ #include <linux/types.h> #include <linux/mutex.h> +#include <linux/msi.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -69,6 +70,8 @@ #define MMIO_EXCL_LIMIT_OFFSET 0x0028 #define MMIO_EXT_FEATURES 0x0030 #define MMIO_PPR_LOG_OFFSET 0x0038 +#define MMIO_GA_LOG_BASE_OFFSET 0x00e0 +#define MMIO_GA_LOG_TAIL_OFFSET 0x00e8 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 @@ -76,6 +79,8 @@ #define MMIO_STATUS_OFFSET 0x2020 #define MMIO_PPR_HEAD_OFFSET 0x2030 #define MMIO_PPR_TAIL_OFFSET 0x2038 +#define MMIO_GA_HEAD_OFFSET 0x2040 +#define MMIO_GA_TAIL_OFFSET 0x2048 #define MMIO_CNTR_CONF_OFFSET 0x4000 #define MMIO_CNTR_REG_OFFSET 0x40000 #define MMIO_REG_END_OFFSET 0x80000 @@ -92,6 +97,7 @@ #define FEATURE_GA (1ULL<<7) #define FEATURE_HE (1ULL<<8) #define FEATURE_PC (1ULL<<9) +#define FEATURE_GAM_VAPIC (1ULL<<21) #define FEATURE_PASID_SHIFT 32 #define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) @@ -110,6 +116,9 @@ #define MMIO_STATUS_EVT_INT_MASK (1 << 1) #define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) #define MMIO_STATUS_PPR_INT_MASK (1 << 6) +#define MMIO_STATUS_GALOG_RUN_MASK (1 << 8) +#define MMIO_STATUS_GALOG_OVERFLOW_MASK (1 << 9) +#define MMIO_STATUS_GALOG_INT_MASK (1 << 10) /* event logging constants */ #define EVENT_ENTRY_SIZE 0x10 @@ -146,6 +155,10 @@ #define CONTROL_PPFINT_EN 0x0eULL #define CONTROL_PPR_EN 0x0fULL #define CONTROL_GT_EN 0x10ULL +#define CONTROL_GA_EN 0x11ULL +#define CONTROL_GAM_EN 0x19ULL +#define CONTROL_GALOG_EN 0x1CULL +#define CONTROL_GAINT_EN 0x1DULL #define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) #define 
CTRL_INV_TO_NONE 0 @@ -224,6 +237,19 @@ #define PPR_REQ_FAULT 0x01 +/* Constants for GA Log handling */ +#define GA_LOG_ENTRIES 512 +#define GA_LOG_SIZE_SHIFT 56 +#define GA_LOG_SIZE_512 (0x8ULL << GA_LOG_SIZE_SHIFT) +#define GA_ENTRY_SIZE 8 +#define GA_LOG_SIZE (GA_ENTRY_SIZE * GA_LOG_ENTRIES) + +#define GA_TAG(x) (u32)(x & 0xffffffffULL) +#define GA_DEVID(x) (u16)(((x) >> 32) & 0xffffULL) +#define GA_REQ_TYPE(x) (((x) >> 60) & 0xfULL) + +#define GA_GUEST_NR 0x1 + #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 @@ -329,6 +355,12 @@ #define IOMMU_CAP_NPCACHE 26 #define IOMMU_CAP_EFR 27 +/* IOMMU Feature Reporting Field (for IVHD type 10h */ +#define IOMMU_FEAT_GASUP_SHIFT 6 + +/* IOMMU Extended Feature Register (EFR) */ +#define IOMMU_EFR_GASUP_SHIFT 7 + #define MAX_DOMAIN_ID 65536 /* Protection domain flags */ @@ -400,6 +432,7 @@ struct amd_iommu_fault { struct iommu_domain; struct irq_domain; +struct amd_irte_ops; /* * This structure contains generic data for IOMMU protection domains @@ -490,6 +523,12 @@ struct amd_iommu { /* Base of the PPR log, if present */ u8 *ppr_log; + /* Base of the GA log, if present */ + u8 *ga_log; + + /* Tail of the GA log, if present */ + u8 *ga_log_tail; + /* true if interrupts for this IOMMU are already enabled */ bool int_enabled; @@ -523,6 +562,8 @@ struct amd_iommu { #ifdef CONFIG_IRQ_REMAP struct irq_domain *ir_domain; struct irq_domain *msi_domain; + + struct amd_irte_ops *irte_ops; #endif }; @@ -681,4 +722,112 @@ static inline int get_hpet_devid(int id) return -EINVAL; } +enum amd_iommu_intr_mode_type { + AMD_IOMMU_GUEST_IR_LEGACY, + + /* This mode is not visible to users. It is used when + * we cannot fully enable vAPIC and fallback to only support + * legacy interrupt remapping via 128-bit IRTE. 
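+ * In this mode guest_mode stays clear in each IRTE, so interrupts
+ * are still remapped to host vectors, just with the wider layout.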
+ */ + AMD_IOMMU_GUEST_IR_LEGACY_GA, + AMD_IOMMU_GUEST_IR_VAPIC, +}; + +#define AMD_IOMMU_GUEST_IR_GA(x) (x == AMD_IOMMU_GUEST_IR_VAPIC || \ + x == AMD_IOMMU_GUEST_IR_LEGACY_GA) + +#define AMD_IOMMU_GUEST_IR_VAPIC(x) (x == AMD_IOMMU_GUEST_IR_VAPIC) + +union irte { + u32 val; + struct { + u32 valid : 1, + no_fault : 1, + int_type : 3, + rq_eoi : 1, + dm : 1, + rsvd_1 : 1, + destination : 8, + vector : 8, + rsvd_2 : 8; + } fields; +}; + +union irte_ga_lo { + u64 val; + + /* For int remapping */ + struct { + u64 valid : 1, + no_fault : 1, + /* ------ */ + int_type : 3, + rq_eoi : 1, + dm : 1, + /* ------ */ + guest_mode : 1, + destination : 8, + rsvd : 48; + } fields_remap; + + /* For guest vAPIC */ + struct { + u64 valid : 1, + no_fault : 1, + /* ------ */ + ga_log_intr : 1, + rsvd1 : 3, + is_run : 1, + /* ------ */ + guest_mode : 1, + destination : 8, + rsvd2 : 16, + ga_tag : 32; + } fields_vapic; +}; + +union irte_ga_hi { + u64 val; + struct { + u64 vector : 8, + rsvd_1 : 4, + ga_root_ptr : 40, + rsvd_2 : 12; + } fields; +}; + +struct irte_ga { + union irte_ga_lo lo; + union irte_ga_hi hi; +}; + +struct irq_2_irte { + u16 devid; /* Device ID for IRTE table */ + u16 index; /* Index into IRTE table*/ +}; + +struct amd_ir_data { + u32 cached_ga_tag; + struct irq_2_irte irq_2_irte; + struct msi_msg msi_entry; + void *entry; /* Pointer to union irte or struct irte_ga */ + void *ref; /* Pointer to the actual irte */ +}; + +struct amd_irte_ops { + void (*prepare)(void *, u32, u32, u8, u32, int); + void (*activate)(void *, u16, u16); + void (*deactivate)(void *, u16, u16); + void (*set_affinity)(void *, u16, u16, u8, u32); + void *(*get)(struct irq_remap_table *, int); + void (*set_allocated)(struct irq_remap_table *, int); + bool (*is_allocated)(struct irq_remap_table *, int); + void (*clear_allocated)(struct irq_remap_table *, int); +}; + +#ifdef CONFIG_IRQ_REMAP +extern struct amd_irte_ops irte_32_ops; +extern struct amd_irte_ops irte_128_ops; +#endif + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 2b08e79f5100..09751d349963 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -22,6 +22,20 @@ #include <linux/types.h> +/* + * This is mainly used to communicate information back-and-forth + * between SVM and IOMMU for setting up and tearing down posted + * interrupt + */ +struct amd_iommu_pi_data { + u32 ga_tag; + u32 prev_ga_tag; + u64 base; + bool is_guest_mode; + struct vcpu_data *vcpu_data; + void *ir_data; +}; + #ifdef CONFIG_AMD_IOMMU struct task_struct; @@ -168,11 +182,34 @@ typedef void (*amd_iommu_invalidate_ctx)(struct pci_dev *pdev, int pasid); extern int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, amd_iommu_invalidate_ctx cb); - -#else +#else /* CONFIG_AMD_IOMMU */ static inline int amd_iommu_detect(void) { return -ENODEV; } -#endif +#endif /* CONFIG_AMD_IOMMU */ + +#if defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) + +/* IOMMU AVIC Function */ +extern int amd_iommu_register_ga_log_notifier(int (*notifier)(u32)); + +extern int +amd_iommu_update_ga(int cpu, bool is_run, void *data); + +#else /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ + +static inline int +amd_iommu_register_ga_log_notifier(int (*notifier)(u32)) +{ + return 0; +} + +static inline int +amd_iommu_update_ga(int cpu, bool is_run, void *data) +{ + return 0; +} + +#endif /* defined(CONFIG_AMD_IOMMU) && defined(CONFIG_IRQ_REMAP) */ #endif /* _ASM_X86_AMD_IOMMU_H */ diff --git 
a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 56b0b7ec66aa..99ac022edc60 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -337,6 +337,7 @@ */ #define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107 #define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109 +#define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 #define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 #define E_ITS_MAPD_DEVICE_OOR 0x010801 #define E_ITS_MAPC_PROCNUM_OOR 0x010902 diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 4fde8c7dfcfe..4309b60ebf17 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -31,8 +31,8 @@ #include "trace.h" static struct timecounter *timecounter; -static struct workqueue_struct *wqueue; static unsigned int host_vtimer_irq; +static u32 host_vtimer_irq_flags; void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { @@ -140,7 +140,7 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_RESTART; } - queue_work(wqueue, &timer->expired); + schedule_work(&timer->expired); return HRTIMER_NORESTART; } @@ -365,7 +365,7 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) static void kvm_timer_init_interrupt(void *info) { - enable_percpu_irq(host_vtimer_irq, 0); + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) @@ -432,6 +432,14 @@ int kvm_timer_hyp_init(void) } host_vtimer_irq = info->virtual_irq; + host_vtimer_irq_flags = irq_get_trigger_type(host_vtimer_irq); + if (host_vtimer_irq_flags != IRQF_TRIGGER_HIGH && + host_vtimer_irq_flags != IRQF_TRIGGER_LOW) { + kvm_err("Invalid trigger for IRQ%d, assuming level low\n", + host_vtimer_irq); + host_vtimer_irq_flags = IRQF_TRIGGER_LOW; + } + err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler, "kvm guest timer", kvm_get_running_vcpus()); if (err) { @@ -440,12 +448,6 @@ int kvm_timer_hyp_init(void) goto out; } - wqueue = create_singlethread_workqueue("kvm_arch_timer"); - if (!wqueue) { - err = -ENOMEM; - goto out_free; - } - kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, @@ -509,7 +511,7 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) * VCPUs have the enabled variable set, before entering the guest, if * the arch timers are enabled. */ - if (timecounter && wqueue) + if (timecounter) timer->enabled = 1; return 0; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 07411cf967b9..4660a7d04eea 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -51,7 +51,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid) irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL); if (!irq) - return NULL; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&irq->lpi_list); INIT_LIST_HEAD(&irq->ap_list); @@ -441,39 +441,63 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, * Find the target VCPU and the LPI number for a given devid/eventid pair * and make this IRQ pending, possibly injecting it. * Must be called with the its_lock mutex held. + * Returns 0 on success, a positive error value for any ITS mapping + * related errors and negative error values for generic errors. 
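+ * The positive values are the ITS command error codes (E_ITS_*);
+ * vgic_its_inject_msi() below folds those into the KVM_SIGNAL_MSI
+ * convention by reporting them as a blocked MSI (return value 0).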
*/ -static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid) +static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) { + struct kvm_vcpu *vcpu; struct its_itte *itte; if (!its->enabled) - return; + return -EBUSY; itte = find_itte(its, devid, eventid); - /* Triggering an unmapped IRQ gets silently dropped. */ - if (itte && its_is_collection_mapped(itte->collection)) { - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); - if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) { - spin_lock(&itte->irq->irq_lock); - itte->irq->pending = true; - vgic_queue_irq_unlock(kvm, itte->irq); - } - } + if (!itte || !its_is_collection_mapped(itte->collection)) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr); + if (!vcpu) + return E_ITS_INT_UNMAPPED_INTERRUPT; + + if (!vcpu->arch.vgic_cpu.lpis_enabled) + return -EBUSY; + + spin_lock(&itte->irq->irq_lock); + itte->irq->pending = true; + vgic_queue_irq_unlock(kvm, itte->irq); + + return 0; +} + +static struct vgic_io_device *vgic_get_its_iodev(struct kvm_io_device *dev) +{ + struct vgic_io_device *iodev; + + if (dev->ops != &kvm_io_gic_ops) + return NULL; + + iodev = container_of(dev, struct vgic_io_device, dev); + + if (iodev->iodev_type != IODEV_ITS) + return NULL; + + return iodev; } /* * Queries the KVM IO bus framework to get the ITS pointer from the given * doorbell address. * We then call vgic_its_trigger_msi() with the decoded data. + * According to the KVM_SIGNAL_MSI API description returns 1 on success. */ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) { u64 address; struct kvm_io_device *kvm_io_dev; struct vgic_io_device *iodev; + int ret; if (!vgic_has_its(kvm)) return -ENODEV; @@ -485,15 +509,28 @@ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); if (!kvm_io_dev) - return -ENODEV; + return -EINVAL; - iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); + iodev = vgic_get_its_iodev(kvm_io_dev); + if (!iodev) + return -EINVAL; mutex_lock(&iodev->its->its_lock); - vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); + ret = vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); mutex_unlock(&iodev->its->its_lock); - return 0; + if (ret < 0) + return ret; + + /* + * KVM_SIGNAL_MSI demands a return value > 0 for success and 0 + * if the guest has blocked the MSI. So we map any LPI mapping + * related error to that. + */ + if (ret) + return 0; + else + return 1; } /* Requires the its_lock to be held. */ @@ -502,7 +539,8 @@ static void its_free_itte(struct kvm *kvm, struct its_itte *itte) list_del(&itte->itte_list); /* This put matches the get in vgic_add_lpi. */ - vgic_put_irq(kvm, itte->irq); + if (itte->irq) + vgic_put_irq(kvm, itte->irq); kfree(itte); } @@ -697,6 +735,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, struct its_device *device; struct its_collection *collection, *new_coll = NULL; int lpi_nr; + struct vgic_irq *irq; device = find_its_device(its, device_id); if (!device) @@ -710,6 +749,10 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser)) return E_ITS_MAPTI_PHYSICALID_OOR; + /* If there is an existing mapping, behavior is UNPREDICTABLE. 
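+ * We implement it as a silent success: keep the first mapping
+ * and ignore the new request.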
*/ + if (find_itte(its, device_id, event_id)) + return 0; + collection = find_collection(its, coll_id); if (!collection) { int ret = vgic_its_alloc_collection(its, &collection, coll_id); @@ -718,22 +761,28 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, new_coll = collection; } - itte = find_itte(its, device_id, event_id); + itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); if (!itte) { - itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL); - if (!itte) { - if (new_coll) - vgic_its_free_collection(its, coll_id); - return -ENOMEM; - } - - itte->event_id = event_id; - list_add_tail(&itte->itte_list, &device->itt_head); + if (new_coll) + vgic_its_free_collection(its, coll_id); + return -ENOMEM; } + itte->event_id = event_id; + list_add_tail(&itte->itte_list, &device->itt_head); + itte->collection = collection; itte->lpi = lpi_nr; - itte->irq = vgic_add_lpi(kvm, lpi_nr); + + irq = vgic_add_lpi(kvm, lpi_nr); + if (IS_ERR(irq)) { + if (new_coll) + vgic_its_free_collection(its, coll_id); + its_free_itte(kvm, itte); + return PTR_ERR(irq); + } + itte->irq = irq; + update_affinity_itte(kvm, itte); /* @@ -981,9 +1030,7 @@ static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its, u32 msi_data = its_cmd_get_id(its_cmd); u64 msi_devid = its_cmd_get_deviceid(its_cmd); - vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); - - return 0; + return vgic_its_trigger_msi(kvm, its, msi_devid, msi_data); } /* @@ -1288,13 +1335,13 @@ void vgic_enable_lpis(struct kvm_vcpu *vcpu) its_sync_lpi_pending_table(vcpu); } -static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) +static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its) { struct vgic_io_device *iodev = &its->iodev; int ret; - if (its->initialized) - return 0; + if (!its->initialized) + return -EBUSY; if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) return -ENXIO; @@ -1311,9 +1358,6 @@ static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its) KVM_VGIC_V3_ITS_SIZE, &iodev->dev); mutex_unlock(&kvm->slots_lock); - if (!ret) - its->initialized = true; - return ret; } @@ -1435,9 +1479,6 @@ static int vgic_its_set_attr(struct kvm_device *dev, if (type != KVM_VGIC_ITS_ADDR_TYPE) return -ENODEV; - if (its->initialized) - return -EBUSY; - if (copy_from_user(&addr, uaddr, sizeof(addr))) return -EFAULT; @@ -1453,7 +1494,9 @@ static int vgic_its_set_attr(struct kvm_device *dev, case KVM_DEV_ARM_VGIC_GRP_CTRL: switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: - return vgic_its_init_its(dev->kvm, its); + its->initialized = true; + + return 0; } break; } @@ -1498,3 +1541,30 @@ int kvm_vgic_register_its_device(void) return kvm_register_device_ops(&kvm_arm_vgic_its_ops, KVM_DEV_TYPE_ARM_VGIC_ITS); } + +/* + * Registers all ITSes with the kvm_io_bus framework. + * To follow the existing VGIC initialization sequence, this has to be + * done as late as possible, just before the first VCPU runs. + */ +int vgic_register_its_iodevs(struct kvm *kvm) +{ + struct kvm_device *dev; + int ret = 0; + + list_for_each_entry(dev, &kvm->devices, vm_node) { + if (dev->ops != &kvm_arm_vgic_its_ops) + continue; + + ret = vgic_register_its_iodev(kvm, dev->private); + if (ret) + return ret; + /* + * We don't need to care about tearing down previously + * registered ITSes, as the kvm_io_bus framework removes + * them for us if the VM gets destroyed. 
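+ * That is also why a failure here can simply propagate ret
+ * upward without unwinding the earlier registrations.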
+ */ + } + + return ret; +} diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index ff668e0dd586..90d81811fdda 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -306,16 +306,19 @@ static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu, { struct vgic_dist *dist = &vcpu->kvm->arch.vgic; struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 propbaser = dist->propbaser; + u64 old_propbaser, propbaser; /* Storing a value with LPIs already enabled is undefined */ if (vgic_cpu->lpis_enabled) return; - propbaser = update_64bit_reg(propbaser, addr & 4, len, val); - propbaser = vgic_sanitise_propbaser(propbaser); - - dist->propbaser = propbaser; + do { + old_propbaser = dist->propbaser; + propbaser = old_propbaser; + propbaser = update_64bit_reg(propbaser, addr & 4, len, val); + propbaser = vgic_sanitise_propbaser(propbaser); + } while (cmpxchg64(&dist->propbaser, old_propbaser, + propbaser) != old_propbaser); } static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu, @@ -331,16 +334,19 @@ static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu, unsigned long val) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - u64 pendbaser = vgic_cpu->pendbaser; + u64 old_pendbaser, pendbaser; /* Storing a value with LPIs already enabled is undefined */ if (vgic_cpu->lpis_enabled) return; - pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); - pendbaser = vgic_sanitise_pendbaser(pendbaser); - - vgic_cpu->pendbaser = pendbaser; + do { + old_pendbaser = vgic_cpu->pendbaser; + pendbaser = old_pendbaser; + pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val); + pendbaser = vgic_sanitise_pendbaser(pendbaser); + } while (cmpxchg64(&vgic_cpu->pendbaser, old_pendbaser, + pendbaser) != old_pendbaser); } /* diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 0506543df38a..9f0dae397d9c 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -289,6 +289,14 @@ int vgic_v3_map_resources(struct kvm *kvm) goto out; } + if (vgic_has_its(kvm)) { + ret = vgic_register_its_iodevs(kvm); + if (ret) { + kvm_err("Unable to register VGIC ITS MMIO regions\n"); + goto out; + } + } + dist->ready = true; out: diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index e7aeac719e09..e83b7fe4baae 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -117,17 +117,17 @@ static void vgic_irq_release(struct kref *ref) void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) { - struct vgic_dist *dist; + struct vgic_dist *dist = &kvm->arch.vgic; if (irq->intid < VGIC_MIN_LPI) return; - if (!kref_put(&irq->refcount, vgic_irq_release)) + spin_lock(&dist->lpi_list_lock); + if (!kref_put(&irq->refcount, vgic_irq_release)) { + spin_unlock(&dist->lpi_list_lock); return; + }; - dist = &kvm->arch.vgic; - - spin_lock(&dist->lpi_list_lock); list_del(&irq->lpi_list); dist->lpi_list_count--; spin_unlock(&dist->lpi_list_lock); diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 1d8e21d5c13f..6c4625c46368 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -84,6 +84,7 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu); int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +int vgic_register_its_iodevs(struct kvm *kvm); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); void vgic_enable_lpis(struct kvm_vcpu 
*vcpu); @@ -140,6 +141,11 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm, return -ENODEV; } +static inline int vgic_register_its_iodevs(struct kvm *kvm) +{ + return -ENODEV; +} + static inline bool vgic_has_its(struct kvm *kvm) { return false; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index e469b6012471..f397e9b20370 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -42,7 +42,6 @@ #ifdef CONFIG_HAVE_KVM_IRQFD -static struct workqueue_struct *irqfd_cleanup_wq; static void irqfd_inject(struct work_struct *work) @@ -168,7 +167,7 @@ irqfd_deactivate(struct kvm_kernel_irqfd *irqfd) list_del_init(&irqfd->list); - queue_work(irqfd_cleanup_wq, &irqfd->shutdown); + schedule_work(&irqfd->shutdown); } int __attribute__((weak)) kvm_arch_set_irq_inatomic( @@ -555,7 +554,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) * so that we guarantee there will not be any more interrupts on this * gsi once this deassign function returns. */ - flush_workqueue(irqfd_cleanup_wq); + flush_work(&irqfd->shutdown); return 0; } @@ -592,7 +591,7 @@ kvm_irqfd_release(struct kvm *kvm) * Block until we know all outstanding shutdown jobs have completed * since we do not take a kvm* reference. */ - flush_workqueue(irqfd_cleanup_wq); + flush_work(&irqfd->shutdown); } @@ -622,23 +621,8 @@ void kvm_irq_routing_update(struct kvm *kvm) spin_unlock_irq(&kvm->irqfds.lock); } -/* - * create a host-wide workqueue for issuing deferred shutdown requests - * aggregated from all vm* instances. We need our own isolated single-thread - * queue to prevent deadlock against flushing the normal work-queue. - */ -int kvm_irqfd_init(void) -{ - irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); - if (!irqfd_cleanup_wq) - return -ENOMEM; - - return 0; -} - void kvm_irqfd_exit(void) { - destroy_workqueue(irqfd_cleanup_wq); } #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 195078225aa5..b3fa12ce1166 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3807,12 +3807,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, * kvm_arch_init makes sure there's at most one caller * for architectures that support multiple implementations, * like intel and amd on x86. - * kvm_arch_init must be called before kvm_irqfd_init to avoid creating - * conflicts in case kvm is already setup for another implementation. */ - r = kvm_irqfd_init(); - if (r) - goto out_irqfd; if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; @@ -3894,7 +3889,6 @@ out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: kvm_irqfd_exit(); -out_irqfd: kvm_arch_exit(); out_fail: return r;
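To make the new posted-interrupt API concrete, here is a minimal, hypothetical sketch of an SVM-side consumer driving amd_iommu_update_ga() around vCPU scheduling. It is not part of the patches above: the example_vcpu_load()/example_vcpu_put() helpers and the way ir_data is obtained are assumptions for illustration; only the amd_iommu_update_ga() signature and the is_run/destination semantics come from the diff.

#include <linux/kernel.h>
#include <linux/amd-iommu.h>

/*
 * Hypothetical sketch (assumes CONFIG_AMD_IOMMU and CONFIG_IRQ_REMAP).
 * "ir_data" stands for the per-interrupt cookie a caller would have
 * saved from amd_iommu_pi_data.ir_data when irq_set_vcpu_affinity()
 * programmed the guest-mode IRTE.
 */
static void example_vcpu_load(void *ir_data, int cpu)
{
	/*
	 * Mark the vCPU as running on @cpu: the IOMMU then posts
	 * device interrupts straight to the guest vAPIC backing page
	 * instead of logging a GA event.
	 */
	WARN_ON(amd_iommu_update_ga(cpu, true, ir_data));
}

static void example_vcpu_put(void *ir_data)
{
	/*
	 * Clear is_run; a negative cpu leaves the cached destination
	 * untouched.  Later interrupts then raise a GA log event so
	 * the host can wake the descheduled vCPU.
	 */
	WARN_ON(amd_iommu_update_ga(-1, false, ir_data));
}

The matching wakeup side would register a handler with amd_iommu_register_ga_log_notifier(); the u32 it receives is the ga_tag programmed into the IRTE, which the caller can map back to the target vCPU.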