author     Michael Ellerman <mpe@ellerman.id.au>   2022-05-19 23:10:42 +1000
committer  Michael Ellerman <mpe@ellerman.id.au>   2022-05-19 23:10:42 +1000
commit     b104e41cda1ef9c5e851a7de3f30b53535e7d528 (patch)
tree       7bab688d125e67e42d387b97c45522c76155bbe2 /arch/powerpc/kvm
parent     a5fc286f69fc9590c22995fe05dca461fd6295b1 (diff)
parent     ad55bae7dc364417434b69dd6c30104f20d0f84d (diff)
Merge branch 'topic/ppc-kvm' into next
Merge our KVM topic branch.
Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r--  arch/powerpc/kvm/Makefile                 |   5
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c       |  40
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio.c          |  43
-rw-r--r--  arch/powerpc/kvm/book3s_64_vio_hv.c       | 672
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c              |  73
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c      |  64
-rw-r--r--  arch/powerpc/kvm/book3s_hv_nested.c       | 137
-rw-r--r--  arch/powerpc/kvm/book3s_hv_p9_entry.c     |  15
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c      |   7
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xive.c      |  46
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  30
-rw-r--r--  arch/powerpc/kvm/book3s_hv_uvmem.c        |   8
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c         |   6
-rw-r--r--  arch/powerpc/kvm/book3s_xive.c            | 649
-rw-r--r--  arch/powerpc/kvm/book3s_xive.h            |   7
-rw-r--r--  arch/powerpc/kvm/book3s_xive_template.c   | 636
-rw-r--r--  arch/powerpc/kvm/e500mc.c                 |   1
-rw-r--r--  arch/powerpc/kvm/powerpc.c                |  30
-rw-r--r--  arch/powerpc/kvm/trace_hv.h               |   8
19 files changed, 883 insertions, 1594 deletions
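The first hunk below (kvmppc_mmu_hv_init) stops claiming the host and reserved LPIDs one by one and instead just sizes the usable LPID pool and hands it to kvmppc_init_lpid(). As a rough illustration of that idea only, here is a minimal user-space sketch of a bitmap-style LPID pool; the names (init_lpid, alloc_lpid, MAX_LPIDS, lpid_inuse) and the layout are assumptions for the sketch, not the kernel's actual allocator.

/*
 * Minimal user-space sketch (not the kernel implementation) of handing
 * out LPIDs from a pool sized at init time, with the partition-switch
 * LPID simply excluded from the pool rather than claimed separately.
 */
#include <stdio.h>
#include <string.h>
#include <limits.h>

#define MAX_LPIDS 4096

static unsigned long lpid_inuse[MAX_LPIDS / (sizeof(unsigned long) * CHAR_BIT)];
static unsigned long nr_lpids;

static void init_lpid(unsigned long nr)
{
	nr_lpids = nr > MAX_LPIDS ? MAX_LPIDS : nr;
	memset(lpid_inuse, 0, sizeof(lpid_inuse));
	lpid_inuse[0] |= 1UL;	/* LPID 0 belongs to the host */
}

static long alloc_lpid(void)
{
	unsigned long i;

	for (i = 1; i < nr_lpids; i++) {
		unsigned long word = i / (sizeof(unsigned long) * CHAR_BIT);
		unsigned long bit = 1UL << (i % (sizeof(unsigned long) * CHAR_BIT));

		if (!(lpid_inuse[word] & bit)) {
			lpid_inuse[word] |= bit;	/* mark in use */
			return (long)i;
		}
	}
	return -1;	/* pool exhausted */
}

int main(void)
{
	/* e.g. 12-bit LPIDs with the last one reserved for partition switching */
	init_lpid((1UL << 12) - 1);
	printf("first guest LPID: %ld\n", alloc_lpid());
	return 0;
}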
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 9bdfc8b50899..f17379b0f161 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -37,9 +37,6 @@ kvm-e500mc-objs := \ e500_emulate.o kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) -kvm-book3s_64-builtin-objs-$(CONFIG_SPAPR_TCE_IOMMU) := \ - book3s_64_vio_hv.o - kvm-pr-y := \ fpu.o \ emulate.o \ @@ -76,7 +73,7 @@ kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ - book3s_hv_rm_xics.o book3s_hv_rm_xive.o + book3s_hv_rm_xics.o kvm-book3s_64-builtin-tm-objs-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \ book3s_hv_tm_builtin.o diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1137c4df726c..514fd45c1994 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -256,26 +256,34 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, int kvmppc_mmu_hv_init(void) { - unsigned long host_lpid, rsvd_lpid; + unsigned long nr_lpids; if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE)) return -EINVAL; - host_lpid = 0; - if (cpu_has_feature(CPU_FTR_HVMODE)) - host_lpid = mfspr(SPRN_LPID); + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (WARN_ON(mfspr(SPRN_LPID) != 0)) + return -EINVAL; + nr_lpids = 1UL << mmu_lpid_bits; + } else { + nr_lpids = 1UL << KVM_MAX_NESTED_GUESTS_SHIFT; + } - /* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */ - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - rsvd_lpid = LPID_RSVD; - else - rsvd_lpid = LPID_RSVD_POWER7; + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* POWER7 has 10-bit LPIDs, POWER8 has 12-bit LPIDs */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + WARN_ON(nr_lpids != 1UL << 12); + else + WARN_ON(nr_lpids != 1UL << 10); - kvmppc_init_lpid(rsvd_lpid + 1); + /* + * Reserve the last implemented LPID use in partition + * switching for POWER7 and POWER8. 
+ */ + nr_lpids -= 1; + } - kvmppc_claim_lpid(host_lpid); - /* rsvd_lpid is reserved for use in partition switching */ - kvmppc_claim_lpid(rsvd_lpid); + kvmppc_init_lpid(nr_lpids); return 0; } @@ -879,7 +887,7 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, struct revmap_entry *rev = kvm->arch.hpt.rev; unsigned long head, i, j; __be64 *hptep; - int ret = 0; + bool ret = false; unsigned long *rmapp; rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; @@ -887,7 +895,7 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) { *rmapp &= ~KVMPPC_RMAP_REFERENCED; - ret = 1; + ret = true; } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); @@ -919,7 +927,7 @@ static bool kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, rev[i].guest_rpte |= HPTE_R_R; note_hpte_modification(kvm, &rev[i]); } - ret = 1; + ret = true; } __unlock_hpte(hptep, be64_to_cpu(hptep[0])); } while ((i = j) != head); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 85cfa6328222..d6589c4fe889 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -32,6 +32,18 @@ #include <asm/tce.h> #include <asm/mmu_context.h> +static struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, + unsigned long liobn) +{ + struct kvmppc_spapr_tce_table *stt; + + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} + static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; @@ -753,3 +765,34 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvmppc_spapr_tce_table *stt; + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; + + stt = kvmppc_find_table(vcpu->kvm, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = (ioba >> stt->page_shift) - stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + if (!page) { + vcpu->arch.regs.gpr[4] = 0; + return H_SUCCESS; + } + tbl = (u64 *)page_address(page); + + vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c deleted file mode 100644 index fdcc7b287dd8..000000000000 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ /dev/null @@ -1,672 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Copyright 2010 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> - * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> - * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com> - */ - -#include <linux/types.h> -#include <linux/string.h> -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <linux/highmem.h> -#include <linux/gfp.h> -#include <linux/slab.h> -#include <linux/hugetlb.h> -#include <linux/list.h> -#include <linux/stringify.h> - -#include <asm/kvm_ppc.h> -#include <asm/kvm_book3s.h> -#include <asm/book3s/64/mmu-hash.h> -#include <asm/mmu_context.h> -#include <asm/hvcall.h> -#include <asm/synch.h> -#include <asm/ppc-opcode.h> -#include <asm/udbg.h> -#include <asm/iommu.h> -#include <asm/tce.h> -#include <asm/pte-walk.h> - -#ifdef CONFIG_BUG - -#define WARN_ON_ONCE_RM(condition) ({ \ - static bool __section(".data.unlikely") __warned; \ - int __ret_warn_once = !!(condition); \ - \ - if (unlikely(__ret_warn_once && !__warned)) { \ - __warned = true; \ - pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ - __stringify(condition), \ - __func__, __LINE__); \ - dump_stack(); \ - } \ - unlikely(__ret_warn_once); \ -}) - -#else - -#define WARN_ON_ONCE_RM(condition) ({ \ - int __ret_warn_on = !!(condition); \ - unlikely(__ret_warn_on); \ -}) - -#endif - -/* - * Finds a TCE table descriptor by LIOBN. - * - * WARNING: This will be called in real or virtual mode on HV KVM and virtual - * mode on PR KVM - */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, - unsigned long liobn) -{ - struct kvmppc_spapr_tce_table *stt; - - list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) - if (stt->liobn == liobn) - return stt; - - return NULL; -} -EXPORT_SYMBOL_GPL(kvmppc_find_table); - -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -static long kvmppc_rm_tce_to_ua(struct kvm *kvm, - unsigned long tce, unsigned long *ua) -{ - unsigned long gfn = tce >> PAGE_SHIFT; - struct kvm_memory_slot *memslot; - - memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); - if (!memslot) - return -EINVAL; - - *ua = __gfn_to_hva_memslot(memslot, gfn) | - (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); - - return 0; -} - -/* - * Validates TCE address. - * At the moment flags and page mask are validated. - * As the host kernel does not access those addresses (just puts them - * to the table and user space is supposed to process them), we can skip - * checking other things (such as TCE is a guest RAM address or the page - * was actually allocated). 
- */ -static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long tce) -{ - unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); - enum dma_data_direction dir = iommu_tce_direction(tce); - struct kvmppc_spapr_tce_iommu_table *stit; - unsigned long ua = 0; - - /* Allow userspace to poison TCE table */ - if (dir == DMA_NONE) - return H_SUCCESS; - - if (iommu_tce_check_gpa(stt->page_shift, gpa)) - return H_PARAMETER; - - if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua)) - return H_TOO_HARD; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - unsigned long hpa = 0; - struct mm_iommu_table_group_mem_t *mem; - long shift = stit->tbl->it_page_shift; - - mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift); - if (!mem) - return H_TOO_HARD; - - if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa)) - return H_TOO_HARD; - } - - return H_SUCCESS; -} - -/* Note on the use of page_address() in real mode, - * - * It is safe to use page_address() in real mode on ppc64 because - * page_address() is always defined as lowmem_page_address() - * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic - * operation and does not access page struct. - * - * Theoretically page_address() could be defined different - * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL - * would have to be enabled. - * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, - * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only - * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP - * is not expected to be enabled on ppc32, page_address() - * is safe for ppc32 as well. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ -static u64 *kvmppc_page_address(struct page *page) -{ -#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) -#error TODO: fix to avoid page_address() here -#endif - return (u64 *) page_address(page); -} - -/* - * Handles TCE requests for emulated devices. - * Puts guest TCE values to the table and expects user space to convert them. - * Cannot fail so kvmppc_rm_tce_validate must be called before it. - */ -static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, - unsigned long idx, unsigned long tce) -{ - struct page *page; - u64 *tbl; - - idx -= stt->offset; - page = stt->pages[idx / TCES_PER_PAGE]; - /* - * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is - * being cleared, otherwise it returns H_TOO_HARD and we skip this. - */ - if (!page) { - WARN_ON_ONCE_RM(tce != 0); - return; - } - tbl = kvmppc_page_address(page); - - tbl[idx % TCES_PER_PAGE] = tce; -} - -/* - * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so - * in real mode. - * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is - * allocated or not required (when clearing a tce entry). - */ -static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages, bool clearing) -{ - unsigned long i, idx, sttpage, sttpages; - unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages); - - if (ret) - return ret; - /* - * clearing==true says kvmppc_rm_tce_put won't be allocating pages - * for empty tces. 
- */ - if (clearing) - return H_SUCCESS; - - idx = (ioba >> stt->page_shift) - stt->offset; - sttpage = idx / TCES_PER_PAGE; - sttpages = ALIGN(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) / - TCES_PER_PAGE; - for (i = sttpage; i < sttpage + sttpages; ++i) - if (!stt->pages[i]) - return H_TOO_HARD; - - return H_SUCCESS; -} - -static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm, - struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) -{ - long ret; - - ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true); - - if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) { - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - /* - * kvmppc_rm_tce_iommu_do_map() updates the UA cache after - * calling this so we still get here a valid UA. - */ - if (pua && *pua) - mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); - } - - return ret; -} - -static void iommu_tce_kill_rm(struct iommu_table *tbl, - unsigned long entry, unsigned long pages) -{ - if (tbl->it_ops->tce_kill) - tbl->it_ops->tce_kill(tbl, entry, pages, true); -} - -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, - struct iommu_table *tbl, unsigned long entry) -{ - unsigned long i; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); - - for (i = 0; i < subpages; ++i) { - unsigned long hpa = 0; - enum dma_data_direction dir = DMA_NONE; - - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); - } -} - -static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, - struct iommu_table *tbl, unsigned long entry) -{ - struct mm_iommu_table_group_mem_t *mem = NULL; - const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - - if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); - if (!mem) - return H_TOO_HARD; - - mm_iommu_mapped_dec(mem); - - *pua = cpu_to_be64(0); - - return H_SUCCESS; -} - -static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, - struct iommu_table *tbl, unsigned long entry) -{ - enum dma_data_direction dir = DMA_NONE; - unsigned long hpa = 0; - long ret; - - if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir)) - /* - * real mode xchg can fail if struct page crosses - * a page boundary - */ - return H_TOO_HARD; - - if (dir == DMA_NONE) - return H_SUCCESS; - - ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - if (ret) - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); - - return ret; -} - -static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, - struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl, - unsigned long entry) -{ - unsigned long i, ret = H_SUCCESS; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry * subpages; - - for (i = 0; i < subpages; ++i) { - ret = kvmppc_rm_tce_iommu_do_unmap(kvm, tbl, io_entry + i); - if (ret != H_SUCCESS) - break; - } - - iommu_tce_kill_rm(tbl, io_entry, subpages); - - return ret; -} - -static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, - unsigned long entry, unsigned long ua, - enum dma_data_direction dir) -{ - long ret; - unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); - struct mm_iommu_table_group_mem_t *mem; - - if (!pua) - /* 
it_userspace allocation might be delayed */ - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); - if (!mem) - return H_TOO_HARD; - - if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, - &hpa))) - return H_TOO_HARD; - - if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) - return H_TOO_HARD; - - ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); - if (ret) { - mm_iommu_mapped_dec(mem); - /* - * real mode xchg can fail if struct page crosses - * a page boundary - */ - return H_TOO_HARD; - } - - if (dir != DMA_NONE) - kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); - - *pua = cpu_to_be64(ua); - - return 0; -} - -static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, - struct kvmppc_spapr_tce_table *stt, struct iommu_table *tbl, - unsigned long entry, unsigned long ua, - enum dma_data_direction dir) -{ - unsigned long i, pgoff, ret = H_SUCCESS; - unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); - unsigned long io_entry = entry * subpages; - - for (i = 0, pgoff = 0; i < subpages; - ++i, pgoff += IOMMU_PAGE_SIZE(tbl)) { - - ret = kvmppc_rm_tce_iommu_do_map(kvm, tbl, - io_entry + i, ua + pgoff, dir); - if (ret != H_SUCCESS) - break; - } - - iommu_tce_kill_rm(tbl, io_entry, subpages); - - return ret; -} - -long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba, unsigned long tce) -{ - struct kvmppc_spapr_tce_table *stt; - long ret; - struct kvmppc_spapr_tce_iommu_table *stit; - unsigned long entry, ua = 0; - enum dma_data_direction dir; - - /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ - /* liobn, ioba, tce); */ - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0); - if (ret != H_SUCCESS) - return ret; - - ret = kvmppc_rm_tce_validate(stt, tce); - if (ret != H_SUCCESS) - return ret; - - dir = iommu_tce_direction(tce); - if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) - return H_PARAMETER; - - entry = ioba >> stt->page_shift; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - if (dir == DMA_NONE) - ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, stt, - stit->tbl, entry); - else - ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, - stit->tbl, entry, ua, dir); - - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); - return ret; - } - } - - kvmppc_rm_tce_put(stt, entry, tce); - - return H_SUCCESS; -} - -static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq, - unsigned long ua, unsigned long *phpa) -{ - pte_t *ptep, pte; - unsigned shift = 0; - - /* - * Called in real mode with MSR_EE = 0. We are safe here. - * It is ok to do the lookup with arch.pgdir here, because - * we are doing this on secondary cpus and current task there - * is not the hypervisor. Also this is safe against THP in the - * host, because an IPI to primary thread will wait for the secondary - * to exit which will again result in the below page table walk - * to finish. - */ - /* an rmap lock won't make it safe. because that just ensure hash - * page table entries are removed with rmap lock held. After that - * mmu notifier returns and we go ahead and removing ptes from Qemu page table. 
- */ - ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift); - if (!ptep) - return -ENXIO; - - pte = READ_ONCE(*ptep); - if (!pte_present(pte)) - return -ENXIO; - - if (!shift) - shift = PAGE_SHIFT; - - /* Avoid handling anything potentially complicated in realmode */ - if (shift > PAGE_SHIFT) - return -EAGAIN; - - if (!pte_young(pte)) - return -EAGAIN; - - *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | - (ua & ~PAGE_MASK); - - return 0; -} - -long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, - unsigned long liobn, unsigned long ioba, - unsigned long tce_list, unsigned long npages) -{ - struct kvm *kvm = vcpu->kvm; - struct kvmppc_spapr_tce_table *stt; - long i, ret = H_SUCCESS; - unsigned long tces, entry, ua = 0; - unsigned long mmu_seq; - bool prereg = false; - struct kvmppc_spapr_tce_iommu_table *stit; - - /* - * used to check for invalidations in progress - */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - entry = ioba >> stt->page_shift; - /* - * The spec says that the maximum size of the list is 512 TCEs - * so the whole table addressed resides in 4K page - */ - if (npages > 512) - return H_PARAMETER; - - if (tce_list & (SZ_4K - 1)) - return H_PARAMETER; - - ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false); - if (ret != H_SUCCESS) - return ret; - - if (mm_iommu_preregistered(vcpu->kvm->mm)) { - /* - * We get here if guest memory was pre-registered which - * is normally VFIO case and gpa->hpa translation does not - * depend on hpt. - */ - struct mm_iommu_table_group_mem_t *mem; - - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) - return H_TOO_HARD; - - mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); - if (mem) - prereg = mm_iommu_ua_to_hpa_rm(mem, ua, - IOMMU_PAGE_SHIFT_4K, &tces) == 0; - } - - if (!prereg) { - /* - * This is usually a case of a guest with emulated devices only - * when TCE list is not in preregistered memory. - * We do not require memory to be preregistered in this case - * so lock rmap and do __find_linux_pte_or_hugepte(). 
- */ - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua)) - return H_TOO_HARD; - - arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock); - if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; - } - } - - for (i = 0; i < npages; ++i) { - unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - - ret = kvmppc_rm_tce_validate(stt, tce); - if (ret != H_SUCCESS) - goto unlock_exit; - } - - for (i = 0; i < npages; ++i) { - unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); - - ua = 0; - if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) { - ret = H_PARAMETER; - goto unlock_exit; - } - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, - stit->tbl, entry + i, ua, - iommu_tce_direction(tce)); - - if (ret != H_SUCCESS) { - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, - entry + i); - goto unlock_exit; - } - } - - kvmppc_rm_tce_put(stt, entry + i, tce); - } - -unlock_exit: - if (!prereg) - arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock); - return ret; -} - -long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, - unsigned long liobn, unsigned long ioba, - unsigned long tce_value, unsigned long npages) -{ - struct kvmppc_spapr_tce_table *stt; - long i, ret; - struct kvmppc_spapr_tce_iommu_table *stit; - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0); - if (ret != H_SUCCESS) - return ret; - - /* Check permission bits only to allow userspace poison TCE for debug */ - if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) - return H_PARAMETER; - - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - unsigned long entry = ioba >> stt->page_shift; - - for (i = 0; i < npages; ++i) { - ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, stt, - stit->tbl, entry + i); - - if (ret == H_SUCCESS) - continue; - - if (ret == H_TOO_HARD) - return ret; - - WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); - } - } - - for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) - kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); - - return ret; -} - -/* This can be called in either virtual mode or real mode */ -long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba) -{ - struct kvmppc_spapr_tce_table *stt; - long ret; - unsigned long idx; - struct page *page; - u64 *tbl; - - stt = kvmppc_find_table(vcpu->kvm, liobn); - if (!stt) - return H_TOO_HARD; - - ret = kvmppc_ioba_validate(stt, ioba, 1); - if (ret != H_SUCCESS) - return ret; - - idx = (ioba >> stt->page_shift) - stt->offset; - page = stt->pages[idx / TCES_PER_PAGE]; - if (!page) { - vcpu->arch.regs.gpr[4] = 0; - return H_SUCCESS; - } - tbl = (u64 *)page_address(page); - - vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; - - return H_SUCCESS; -} -EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); - -#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 5ea107d830a6..e08fb3124dca 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1327,6 +1327,12 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd) case H_CONFER: case H_REGISTER_VPA: case H_SET_MODE: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case H_GET_TCE: + case H_PUT_TCE: + case H_PUT_TCE_INDIRECT: + case H_STUFF_TCE: +#endif case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: #ifdef CONFIG_KVM_XICS @@ -2835,7 +2841,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) * to trap and 
then we emulate them. */ vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB | - HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX; + HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP; if (cpu_has_feature(CPU_FTR_HVMODE)) { vcpu->arch.hfscr &= mfspr(SPRN_HFSCR); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -3968,6 +3974,7 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns kvmhv_save_hv_regs(vcpu, &hvregs); hvregs.lpcr = lpcr; + hvregs.amor = ~0; vcpu->arch.regs.msr = vcpu->arch.shregs.msr; hvregs.version = HV_GUEST_STATE_VERSION; if (vcpu->arch.nested) { @@ -4030,6 +4037,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, uns static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb) { + struct kvm *kvm = vcpu->kvm; + struct kvm_nested_guest *nested = vcpu->arch.nested; u64 next_timer; int trap; @@ -4049,34 +4058,61 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb); /* H_CEDE has to be handled now, not later */ - if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && + if (trap == BOOK3S_INTERRUPT_SYSCALL && !nested && kvmppc_get_gpr(vcpu, 3) == H_CEDE) { kvmppc_cede(vcpu); kvmppc_set_gpr(vcpu, 3, 0); trap = 0; } - } else { - struct kvm *kvm = vcpu->kvm; + } else if (nested) { + __this_cpu_write(cpu_in_guest, kvm); + trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb); + __this_cpu_write(cpu_in_guest, NULL); + } else { kvmppc_xive_push_vcpu(vcpu); __this_cpu_write(cpu_in_guest, kvm); trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb); __this_cpu_write(cpu_in_guest, NULL); - if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && + if (trap == BOOK3S_INTERRUPT_SYSCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { unsigned long req = kvmppc_get_gpr(vcpu, 3); - /* H_CEDE has to be handled now, not later */ + /* + * XIVE rearm and XICS hcalls must be handled + * before xive context is pulled (is this + * true?) + */ if (req == H_CEDE) { + /* H_CEDE has to be handled now */ kvmppc_cede(vcpu); - kvmppc_xive_rearm_escalation(vcpu); /* may un-cede */ + if (!kvmppc_xive_rearm_escalation(vcpu)) { + /* + * Pending escalation so abort + * the cede. + */ + vcpu->arch.ceded = 0; + } kvmppc_set_gpr(vcpu, 3, 0); trap = 0; - /* XICS hcalls must be handled before xive is pulled */ + } else if (req == H_ENTER_NESTED) { + /* + * L2 should not run with the L1 + * context so rearm and pull it. + */ + if (!kvmppc_xive_rearm_escalation(vcpu)) { + /* + * Pending escalation so abort + * H_ENTER_NESTED. 
+ */ + kvmppc_set_gpr(vcpu, 3, 0); + trap = 0; + } + } else if (hcall_is_xics(req)) { int ret; @@ -4234,13 +4270,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) start_wait = ktime_get(); vc->vcore_state = VCORE_SLEEPING; - trace_kvmppc_vcore_blocked(vc, 0); + trace_kvmppc_vcore_blocked(vc->runner, 0); spin_unlock(&vc->lock); schedule(); finish_rcuwait(&vc->wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; - trace_kvmppc_vcore_blocked(vc, 1); + trace_kvmppc_vcore_blocked(vc->runner, 1); ++vc->runner->stat.halt_successful_wait; cur = ktime_get(); @@ -4520,9 +4556,14 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (!nested) { kvmppc_core_prepare_to_enter(vcpu); - if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, - &vcpu->arch.pending_exceptions)) + if (vcpu->arch.shregs.msr & MSR_EE) { + if (xive_interrupt_pending(vcpu)) + kvmppc_inject_interrupt_hv(vcpu, + BOOK3S_INTERRUPT_EXTERNAL, 0); + } else if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, + &vcpu->arch.pending_exceptions)) { lpcr |= LPCR_MER; + } } else if (vcpu->arch.pending_exceptions || vcpu->arch.doorbell_request || xive_interrupt_pending(vcpu)) { @@ -4620,9 +4661,9 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, if (kvmppc_vcpu_check_block(vcpu)) break; - trace_kvmppc_vcore_blocked(vc, 0); + trace_kvmppc_vcore_blocked(vcpu, 0); schedule(); - trace_kvmppc_vcore_blocked(vc, 1); + trace_kvmppc_vcore_blocked(vcpu, 1); } finish_rcuwait(wait); } @@ -5284,6 +5325,10 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); lpcr &= LPCR_PECE | LPCR_LPES; } else { + /* + * The L2 LPES mode will be set by the L0 according to whether + * or not it needs to take external interrupts in HV mode. + */ lpcr = 0; } lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7e52d0beee77..88a8f6473c4e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -489,70 +489,6 @@ static long kvmppc_read_one_intr(bool *again) return kvmppc_check_passthru(xisr, xirr, again); } -#ifdef CONFIG_KVM_XICS -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_xirr(vcpu); - else - return xics_rm_h_xirr(vcpu); -} - -unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - vcpu->arch.regs.gpr[5] = get_tb(); - if (xics_on_xive()) - return xive_rm_h_xirr(vcpu); - else - return xics_rm_h_xirr(vcpu); -} - -unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_ipoll(vcpu, server); - else - return H_TOO_HARD; -} - -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_ipi(vcpu, server, mfrr); - else - return xics_rm_h_ipi(vcpu, server, mfrr); -} - -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_cppr(vcpu, cppr); - else - return xics_rm_h_cppr(vcpu, cppr); -} - -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) -{ - if (!kvmppc_xics_enabled(vcpu)) - return H_TOO_HARD; - if (xics_on_xive()) - return xive_rm_h_eoi(vcpu, xirr); - else - 
return xics_rm_h_eoi(vcpu, xirr); -} -#endif /* CONFIG_KVM_XICS */ - void kvmppc_bad_interrupt(struct pt_regs *regs) { /* diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index c943a051c6e7..0644732d1a25 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -261,8 +261,7 @@ static void load_l2_hv_regs(struct kvm_vcpu *vcpu, /* * Don't let L1 change LPCR bits for the L2 except these: */ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | LPCR_MER; /* * Additional filtering is required depending on hardware @@ -439,10 +438,11 @@ long kvmhv_nested_init(void) if (!radix_enabled()) return -ENODEV; - /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */ - ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1; - if (ptb_order < 8) - ptb_order = 8; + /* Partition table entry is 1<<4 bytes in size, hence the 4. */ + ptb_order = KVM_MAX_NESTED_GUESTS_SHIFT + 4; + /* Minimum partition table size is 1<<12 bytes */ + if (ptb_order < 12) + ptb_order = 12; pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order, GFP_KERNEL); if (!pseries_partition_tb) { @@ -450,7 +450,7 @@ long kvmhv_nested_init(void) return -ENOMEM; } - ptcr = __pa(pseries_partition_tb) | (ptb_order - 8); + ptcr = __pa(pseries_partition_tb) | (ptb_order - 12); rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr); if (rc != H_SUCCESS) { pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n", @@ -521,11 +521,6 @@ static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp) kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table); } -void kvmhv_vm_nested_init(struct kvm *kvm) -{ - kvm->arch.max_nested_lpid = -1; -} - /* * Handle the H_SET_PARTITION_TABLE hcall. * r4 = guest real address of partition table + log_2(size) - 12 @@ -539,16 +534,14 @@ long kvmhv_set_partition_table(struct kvm_vcpu *vcpu) long ret = H_SUCCESS; srcu_idx = srcu_read_lock(&kvm->srcu); - /* - * Limit the partition table to 4096 entries (because that's what - * hardware supports), and check the base address. - */ - if ((ptcr & PRTS_MASK) > 12 - 8 || + /* Check partition size and base address. 
*/ + if ((ptcr & PRTS_MASK) + 12 - 4 > KVM_MAX_NESTED_GUESTS_SHIFT || !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT)) ret = H_PARAMETER; srcu_read_unlock(&kvm->srcu, srcu_idx); if (ret == H_SUCCESS) kvm->arch.l1_ptcr = ptcr; + return ret; } @@ -644,7 +637,7 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) ret = -EFAULT; ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4); - if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8))) { + if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) { int srcu_idx = srcu_read_lock(&kvm->srcu); ret = kvm_read_guest(kvm, ptbl_addr, &ptbl_entry, sizeof(ptbl_entry)); @@ -660,6 +653,35 @@ static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp) kvmhv_set_nested_ptbl(gp); } +void kvmhv_vm_nested_init(struct kvm *kvm) +{ + idr_init(&kvm->arch.kvm_nested_guest_idr); +} + +static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid) +{ + return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid); +} + +static bool __prealloc_nested(struct kvm *kvm, int lpid) +{ + if (idr_alloc(&kvm->arch.kvm_nested_guest_idr, + NULL, lpid, lpid + 1, GFP_KERNEL) != lpid) + return false; + return true; +} + +static void __add_nested(struct kvm *kvm, int lpid, struct kvm_nested_guest *gp) +{ + if (idr_replace(&kvm->arch.kvm_nested_guest_idr, gp, lpid)) + WARN_ON(1); +} + +static void __remove_nested(struct kvm *kvm, int lpid) +{ + idr_remove(&kvm->arch.kvm_nested_guest_idr, lpid); +} + static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid) { struct kvm_nested_guest *gp; @@ -720,13 +742,8 @@ static void kvmhv_remove_nested(struct kvm_nested_guest *gp) long ref; spin_lock(&kvm->mmu_lock); - if (gp == kvm->arch.nested_guests[lpid]) { - kvm->arch.nested_guests[lpid] = NULL; - if (lpid == kvm->arch.max_nested_lpid) { - while (--lpid >= 0 && !kvm->arch.nested_guests[lpid]) - ; - kvm->arch.max_nested_lpid = lpid; - } + if (gp == __find_nested(kvm, lpid)) { + __remove_nested(kvm, lpid); --gp->refcnt; } ref = gp->refcnt; @@ -743,24 +760,22 @@ static void kvmhv_remove_nested(struct kvm_nested_guest *gp) */ void kvmhv_release_all_nested(struct kvm *kvm) { - int i; + int lpid; struct kvm_nested_guest *gp; struct kvm_nested_guest *freelist = NULL; struct kvm_memory_slot *memslot; int srcu_idx, bkt; spin_lock(&kvm->mmu_lock); - for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { - gp = kvm->arch.nested_guests[i]; - if (!gp) - continue; - kvm->arch.nested_guests[i] = NULL; + idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) { + __remove_nested(kvm, lpid); if (--gp->refcnt == 0) { gp->next = freelist; freelist = gp; } } - kvm->arch.max_nested_lpid = -1; + idr_destroy(&kvm->arch.kvm_nested_guest_idr); + /* idr is empty and may be reused at this point */ spin_unlock(&kvm->mmu_lock); while ((gp = freelist) != NULL) { freelist = gp->next; @@ -792,12 +807,11 @@ struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, { struct kvm_nested_guest *gp, *newgp; - if (l1_lpid >= KVM_MAX_NESTED_GUESTS || - l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) + if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) return NULL; spin_lock(&kvm->mmu_lock); - gp = kvm->arch.nested_guests[l1_lpid]; + gp = __find_nested(kvm, l1_lpid); if (gp) ++gp->refcnt; spin_unlock(&kvm->mmu_lock); @@ -808,17 +822,19 @@ struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid, newgp = kvmhv_alloc_nested(kvm, l1_lpid); if (!newgp) return NULL; + 
+ if (!__prealloc_nested(kvm, l1_lpid)) { + kvmhv_release_nested(newgp); + return NULL; + } + spin_lock(&kvm->mmu_lock); - if (kvm->arch.nested_guests[l1_lpid]) { - /* someone else beat us to it */ - gp = kvm->arch.nested_guests[l1_lpid]; - } else { - kvm->arch.nested_guests[l1_lpid] = newgp; + gp = __find_nested(kvm, l1_lpid); + if (!gp) { + __add_nested(kvm, l1_lpid, newgp); ++newgp->refcnt; gp = newgp; newgp = NULL; - if (l1_lpid > kvm->arch.max_nested_lpid) - kvm->arch.max_nested_lpid = l1_lpid; } ++gp->refcnt; spin_unlock(&kvm->mmu_lock); @@ -841,20 +857,13 @@ void kvmhv_put_nested(struct kvm_nested_guest *gp) kvmhv_release_nested(gp); } -static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid) -{ - if (lpid > kvm->arch.max_nested_lpid) - return NULL; - return kvm->arch.nested_guests[lpid]; -} - pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid, unsigned long ea, unsigned *hshift) { struct kvm_nested_guest *gp; pte_t *pte; - gp = kvmhv_find_nested(kvm, lpid); + gp = __find_nested(kvm, lpid); if (!gp) return NULL; @@ -960,7 +969,7 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, gpa = n_rmap & RMAP_NESTED_GPA_MASK; lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT; - gp = kvmhv_find_nested(kvm, lpid); + gp = __find_nested(kvm, lpid); if (!gp) return; @@ -1152,16 +1161,13 @@ static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric) { struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *gp; - int i; + int lpid; spin_lock(&kvm->mmu_lock); - for (i = 0; i <= kvm->arch.max_nested_lpid; i++) { - gp = kvm->arch.nested_guests[i]; - if (gp) { - spin_unlock(&kvm->mmu_lock); - kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); - spin_lock(&kvm->mmu_lock); - } + idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) { + spin_unlock(&kvm->mmu_lock); + kvmhv_emulate_tlbie_lpid(vcpu, gp, ric); + spin_lock(&kvm->mmu_lock); } spin_unlock(&kvm->mmu_lock); } @@ -1313,7 +1319,7 @@ long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid, * H_ENTER_NESTED call. Since we can't differentiate this case from * the invalid case, we ignore such flush requests and return success. */ - if (!kvmhv_find_nested(vcpu->kvm, lpid)) + if (!__find_nested(vcpu->kvm, lpid)) return H_SUCCESS; /* @@ -1657,15 +1663,12 @@ long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu) int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid) { - int ret = -1; + int ret = lpid + 1; spin_lock(&kvm->mmu_lock); - while (++lpid <= kvm->arch.max_nested_lpid) { - if (kvm->arch.nested_guests[lpid]) { - ret = lpid; - break; - } - } + if (!idr_get_next(&kvm->arch.kvm_nested_guest_idr, &ret)) + ret = -1; spin_unlock(&kvm->mmu_lock); + return ret; } diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c b/arch/powerpc/kvm/book3s_hv_p9_entry.c index ac38c1cad378..112a09b33328 100644 --- a/arch/powerpc/kvm/book3s_hv_p9_entry.c +++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c @@ -539,8 +539,10 @@ static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u6 { struct kvm_nested_guest *nested = vcpu->arch.nested; u32 lpid; + u32 pid; lpid = nested ? 
nested->shadow_lpid : kvm->arch.lpid; + pid = vcpu->arch.pid; /* * Prior memory accesses to host PID Q3 must be completed before we @@ -551,7 +553,7 @@ static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u6 isync(); mtspr(SPRN_LPID, lpid); mtspr(SPRN_LPCR, lpcr); - mtspr(SPRN_PID, vcpu->arch.pid); + mtspr(SPRN_PID, pid); /* * isync not required here because we are HRFID'ing to guest before * any guest context access, which is context synchronising. @@ -561,9 +563,11 @@ static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u6 static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) { u32 lpid; + u32 pid; int i; lpid = kvm->arch.lpid; + pid = vcpu->arch.pid; /* * See switch_mmu_to_guest_radix. ptesync should not be required here @@ -574,7 +578,7 @@ static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 isync(); mtspr(SPRN_LPID, lpid); mtspr(SPRN_LPCR, lpcr); - mtspr(SPRN_PID, vcpu->arch.pid); + mtspr(SPRN_PID, pid); for (i = 0; i < vcpu->arch.slb_max; i++) mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv); @@ -585,6 +589,9 @@ static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 static void switch_mmu_to_host(struct kvm *kvm, u32 pid) { + u32 lpid = kvm->arch.host_lpid; + u64 lpcr = kvm->arch.host_lpcr; + /* * The guest has exited, so guest MMU context is no longer being * non-speculatively accessed, but a hwsync is needed before the @@ -594,8 +601,8 @@ static void switch_mmu_to_host(struct kvm *kvm, u32 pid) asm volatile("hwsync" ::: "memory"); isync(); mtspr(SPRN_PID, pid); - mtspr(SPRN_LPID, kvm->arch.host_lpid); - mtspr(SPRN_LPCR, kvm->arch.host_lpcr); + mtspr(SPRN_LPID, lpid); + mtspr(SPRN_LPCR, lpcr); /* * isync is not required after the switch, because mtmsrd with L=0 * is performed after this switch, which is context synchronising. diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 587c33fc4564..e165bfa842bf 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -479,6 +479,11 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, } } +unsigned long xics_rm_h_xirr_x(struct kvm_vcpu *vcpu) +{ + vcpu->arch.regs.gpr[5] = get_tb(); + return xics_rm_h_xirr(vcpu); +} unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu) { @@ -883,7 +888,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, /* --- Non-real mode XICS-related built-in routines --- */ -/** +/* * Host Operations poked by RM KVM */ static void rm_host_ipi_action(int action, void *data) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c deleted file mode 100644 index dd9880731bd6..000000000000 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/kvm_host.h> -#include <linux/err.h> -#include <linux/kernel_stat.h> -#include <linux/pgtable.h> - -#include <asm/kvm_book3s.h> -#include <asm/kvm_ppc.h> -#include <asm/hvcall.h> -#include <asm/xics.h> -#include <asm/debug.h> -#include <asm/synch.h> -#include <asm/cputhreads.h> -#include <asm/ppc-opcode.h> -#include <asm/pnv-pci.h> -#include <asm/opal.h> -#include <asm/smp.h> -#include <asm/xive.h> -#include <asm/xive-regs.h> - -#include "book3s_xive.h" - -/* XXX */ -#include <asm/udbg.h> -//#define DBG(fmt...) udbg_printf(fmt) -#define DBG(fmt...) 
do { } while(0) - -static inline void __iomem *get_tima_phys(void) -{ - return local_paca->kvm_hstate.xive_tima_phys; -} - -#undef XIVE_RUNTIME_CHECKS -#define X_PFX xive_rm_ -#define X_STATIC -#define X_STAT_PFX stat_rm_ -#define __x_tima get_tima_phys() -#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) -#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_writeb __raw_rm_writeb -#define __x_readw __raw_rm_readw -#define __x_readq __raw_rm_readq -#define __x_writeq __raw_rm_writeq - -#include "book3s_xive_template.c" diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index d185dee26026..0fc0e68d20d0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -51,6 +51,14 @@ #define STACK_SLOT_FSCR (SFS-96) /* + * Use the last LPID (all implemented LPID bits = 1) for partition switching. + * This is reserved in the LPID allocator. POWER7 only implements 0x3ff, but + * we write 0xfff into the LPID SPR anyway, which seems to work and just + * ignores the top bits. + */ +#define LPID_RSVD 0xfff + +/* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. * @@ -1784,13 +1792,8 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table -#ifdef CONFIG_SPAPR_TCE_IOMMU - .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table -#else .long 0 /* 0x1c */ .long 0 /* 0x20 */ -#endif .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table @@ -1808,11 +1811,11 @@ hcall_real_table: .long 0 /* 0x5c */ .long 0 /* 0x60 */ #ifdef CONFIG_KVM_XICS - .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table + .long DOTSYM(xics_rm_h_eoi) - hcall_real_table + .long DOTSYM(xics_rm_h_cppr) - hcall_real_table + .long DOTSYM(xics_rm_h_ipi) - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(xics_rm_h_xirr) - hcall_real_table #else .long 0 /* 0x64 - H_EOI */ .long 0 /* 0x68 - H_CPPR */ @@ -1868,13 +1871,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table -#ifdef CONFIG_SPAPR_TCE_IOMMU - .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table - .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table -#else .long 0 /* 0x138 */ .long 0 /* 0x13c */ -#endif .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ @@ -1987,7 +1985,7 @@ hcall_real_table: .long 0 /* 0x2f4 */ .long 0 /* 0x2f8 */ #ifdef CONFIG_KVM_XICS - .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table + .long DOTSYM(xics_rm_h_xirr_x) - hcall_real_table #else .long 0 /* 0x2fc - H_XIRR_X*/ #endif diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 0f66f7fbfb80..598006301620 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -361,13 +361,15 @@ static bool kvmppc_gfn_is_uvmem_pfn(unsigned long gfn, struct kvm *kvm, static bool kvmppc_next_nontransitioned_gfn(const struct kvm_memory_slot *memslot, struct kvm *kvm, unsigned long *gfn) { - struct kvmppc_uvmem_slot *p; + struct kvmppc_uvmem_slot *p = NULL, *iter; bool 
ret = false; unsigned long i; - list_for_each_entry(p, &kvm->arch.uvmem_pfns, list) - if (*gfn >= p->base_pfn && *gfn < p->base_pfn + p->nr_pfns) + list_for_each_entry(iter, &kvm->arch.uvmem_pfns, list) + if (*gfn >= iter->base_pfn && *gfn < iter->base_pfn + iter->nr_pfns) { + p = iter; break; + } if (!p) return ret; /* diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index dc4f51ac84bc..a1f2978b2a86 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -433,9 +433,12 @@ int kvmppc_hcall_impl_pr(unsigned long cmd) case H_REMOVE: case H_PROTECT: case H_BULK_REMOVE: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case H_GET_TCE: case H_PUT_TCE: case H_PUT_TCE_INDIRECT: case H_STUFF_TCE: +#endif case H_CEDE: case H_LOGICAL_CI_LOAD: case H_LOGICAL_CI_STORE: @@ -464,7 +467,10 @@ static unsigned int default_hcall_list[] = { H_REMOVE, H_PROTECT, H_BULK_REMOVE, +#ifdef CONFIG_SPAPR_TCE_IOMMU + H_GET_TCE, H_PUT_TCE, +#endif H_CEDE, H_SET_MODE, #ifdef CONFIG_KVM_XICS diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 24d434f1f012..4ca23644f752 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -30,27 +30,629 @@ #include "book3s_xive.h" - -/* - * Virtual mode variants of the hcalls for use on radix/radix - * with AIL. They require the VCPU's VP to be "pushed" - * - * We still instantiate them here because we use some of the - * generated utility functions as well in this file. - */ -#define XIVE_RUNTIME_CHECKS -#define X_PFX xive_vm_ -#define X_STATIC static -#define X_STAT_PFX stat_vm_ -#define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_writeb __raw_writeb -#define __x_readw __raw_readw -#define __x_readq __raw_readq -#define __x_writeq __raw_writeq -#include "book3s_xive_template.c" +/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */ +#define XICS_DUMMY 1 + +static void xive_vm_ack_pending(struct kvmppc_xive_vcpu *xc) +{ + u8 cppr; + u16 ack; + + /* + * Ensure any previous store to CPPR is ordered vs. + * the subsequent loads from PIPR or ACK. + */ + eieio(); + + /* Perform the acknowledge OS to register cycle. */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_OS_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* XXX Check grouping level */ + + /* Anything ? */ + if (!((ack >> 8) & TM_QW1_NSR_EO)) + return; + + /* Grab CPPR of the most favored pending interrupt */ + cppr = ack & 0xff; + if (cppr < 8) + xc->pending |= 1 << cppr; + + /* Check consistency */ + if (cppr >= xc->hw_cppr) + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->hw_cppr); + + /* + * Update our image of the HW CPPR. We don't yet modify + * xc->cppr, this will be done as we scan for interrupts + * in the queues. 
+ */ + xc->hw_cppr = cppr; +} + +static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + offset |= XIVE_ESB_LD_ST_MO; + + val = __raw_readq(__x_eoi_page(xd) + offset); +#ifdef __LITTLE_ENDIAN__ + val >>= 64-8; +#endif + return (u8)val; +} + + +static void xive_vm_source_eoi(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + __raw_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); + else if (xd->flags & XIVE_IRQ_FLAG_LSI) { + /* + * For LSIs the HW EOI cycle is used rather than PQ bits, + * as they are automatically re-triggred in HW when still + * pending. + */ + __raw_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); + } else { + uint64_t eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthetizing an interrupt in software + */ + eoi_val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_00); + + /* Re-trigger if needed */ + if ((eoi_val & 1) && __x_trig_page(xd)) + __raw_writeq(0, __x_trig_page(xd)); + } +} + +enum { + scan_fetch, + scan_poll, + scan_eoi, +}; + +static u32 xive_vm_scan_interrupts(struct kvmppc_xive_vcpu *xc, + u8 pending, int scan_type) +{ + u32 hirq = 0; + u8 prio = 0xff; + + /* Find highest pending priority */ + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { + struct xive_q *q; + u32 idx, toggle; + __be32 *qpage; + + /* + * If pending is 0 this will return 0xff which is what + * we want + */ + prio = ffs(pending) - 1; + + /* Don't scan past the guest cppr */ + if (prio >= xc->cppr || prio > 7) { + if (xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + } + break; + } + + /* Grab queue and pointers */ + q = &xc->queues[prio]; + idx = q->idx; + toggle = q->toggle; + + /* + * Snapshot the queue page. The test further down for EOI + * must use the same "copy" that was used by __xive_read_eq + * since qpage can be set concurrently and we don't want + * to miss an EOI. + */ + qpage = READ_ONCE(q->qpage); + +skip_ipi: + /* + * Try to fetch from the queue. Will return 0 for a + * non-queueing priority (ie, qpage = 0). + */ + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); + + /* + * If this was a signal for an MFFR change done by + * H_IPI we skip it. Additionally, if we were fetching + * we EOI it now, thus re-enabling reception of a new + * such signal. + * + * We also need to do that if prio is 0 and we had no + * page for the queue. In this case, we have non-queued + * IPI that needs to be EOId. + * + * This is safe because if we have another pending MFRR + * change that wasn't observed above, the Q bit will have + * been set and another occurrence of the IPI will trigger. 
+ */ + if (hirq == XICS_IPI || (prio == 0 && !qpage)) { + if (scan_type == scan_fetch) { + xive_vm_source_eoi(xc->vp_ipi, + &xc->vp_ipi_data); + q->idx = idx; + q->toggle = toggle; + } + /* Loop back on same queue with updated idx/toggle */ + WARN_ON(hirq && hirq != XICS_IPI); + if (hirq) + goto skip_ipi; + } + + /* If it's the dummy interrupt, continue searching */ + if (hirq == XICS_DUMMY) + goto skip_ipi; + + /* Clear the pending bit if the queue is now empty */ + if (!hirq) { + pending &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. + */ + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* + * If the most favoured prio we found pending is less + * favored (or equal) than a pending IPI, we return + * the IPI instead. + */ + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { + prio = xc->mfrr; + hirq = XICS_IPI; + break; + } + + /* If fetching, update queue pointers */ + if (scan_type == scan_fetch) { + q->idx = idx; + q->toggle = toggle; + } + } + + /* If we are just taking a "peek", do nothing else */ + if (scan_type == scan_poll) + return hirq; + + /* Update the pending bits */ + xc->pending = pending; + + /* + * If this is an EOI that's it, no CPPR adjustment done here, + * all we needed was cleanup the stale pending bits and check + * if there's anything left. + */ + if (scan_type == scan_eoi) + return hirq; + + /* + * If we found an interrupt, adjust what the guest CPPR should + * be as if we had just fetched that interrupt from HW. + * + * Note: This can only make xc->cppr smaller as the previous + * loop will only exit with hirq != 0 if prio is lower than + * the current xc->cppr. Thus we don't need to re-check xc->mfrr + * for pending IPIs. + */ + if (hirq) + xc->cppr = prio; + /* + * If it was an IPI the HW CPPR might have been lowered too much + * as the HW interrupt we use for IPIs is routed to priority 0. + * + * We re-sync it here. + */ + if (xc->cppr != xc->hw_cppr) { + xc->hw_cppr = xc->cppr; + __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR); + } + + return hirq; +} + +static unsigned long xive_vm_h_xirr(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 old_cppr; + u32 hirq; + + pr_devel("H_XIRR\n"); + + xc->stat_vm_h_xirr++; + + /* First collect pending bits from HW */ + xive_vm_ack_pending(xc); + + pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", + xc->pending, xc->hw_cppr, xc->cppr); + + /* Grab previous CPPR and reverse map it */ + old_cppr = xive_prio_to_guest(xc->cppr); + + /* Scan for actual interrupts */ + hirq = xive_vm_scan_interrupts(xc, xc->pending, scan_fetch); + + pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", + hirq, xc->hw_cppr, xc->cppr); + + /* That should never hit */ + if (hirq & 0xff000000) + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); + + /* + * XXX We could check if the interrupt is masked here and + * filter it. 
If we chose to do so, we would need to do: + * + * if (masked) { + * lock(); + * if (masked) { + * old_Q = true; + * hirq = 0; + * } + * unlock(); + * } + */ + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24); + + return H_SUCCESS; +} + +static unsigned long xive_vm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + u8 pending = xc->pending; + u32 hirq; + + pr_devel("H_IPOLL(server=%ld)\n", server); + + xc->stat_vm_h_ipoll++; + + /* Grab the target VCPU if not the current one */ + if (xc->server_num != server) { + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Scan all priorities */ + pending = 0xff; + } else { + /* Grab pending interrupt if any */ + __be64 qw1 = __raw_readq(xive_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; + + if (pipr < 8) + pending |= 1 << pipr; + } + + hirq = xive_vm_scan_interrupts(xc, pending, scan_poll); + + /* Return interrupt and old CPPR in GPR4 */ + vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24); + + return H_SUCCESS; +} + +static void xive_vm_push_pending_to_hw(struct kvmppc_xive_vcpu *xc) +{ + u8 pending, prio; + + pending = xc->pending; + if (xc->mfrr != 0xff) { + if (xc->mfrr < 8) + pending |= 1 << xc->mfrr; + else + pending |= 0x80; + } + if (!pending) + return; + prio = ffs(pending) - 1; + + __raw_writeb(prio, xive_tima + TM_SPC_SET_OS_PENDING); +} + +static void xive_vm_scan_for_rerouted_irqs(struct kvmppc_xive *xive, + struct kvmppc_xive_vcpu *xc) +{ + unsigned int prio; + + /* For each priority that is now masked */ + for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) { + struct xive_q *q = &xc->queues[prio]; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_src_block *sb; + u32 idx, toggle, entry, irq, hw_num; + struct xive_irq_data *xd; + __be32 *qpage; + u16 src; + + idx = q->idx; + toggle = q->toggle; + qpage = READ_ONCE(q->qpage); + if (!qpage) + continue; + + /* For each interrupt in the queue */ + for (;;) { + entry = be32_to_cpup(qpage + idx); + + /* No more ? */ + if ((entry >> 31) == toggle) + break; + irq = entry & 0x7fffffff; + + /* Skip dummies and IPIs */ + if (irq == XICS_DUMMY || irq == XICS_IPI) + goto next; + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) + goto next; + state = &sb->irq_state[src]; + + /* Has it been rerouted ? */ + if (xc->server_num == state->act_server) + goto next; + + /* + * Allright, it *has* been re-routed, kill it from + * the queue. 
+ */ + qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY); + + /* Find the HW interrupt */ + kvmppc_xive_select_irq(state, &hw_num, &xd); + + /* If it's not an LSI, set PQ to 11 the EOI will force a resend */ + if (!(xd->flags & XIVE_IRQ_FLAG_LSI)) + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11); + + /* EOI the source */ + xive_vm_source_eoi(hw_num, xd); + +next: + idx = (idx + 1) & q->msk; + if (idx == 0) + toggle ^= 1; + } + } +} + +static int xive_vm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + u8 old_cppr; + + pr_devel("H_CPPR(cppr=%ld)\n", cppr); + + xc->stat_vm_h_cppr++; + + /* Map CPPR */ + cppr = xive_prio_from_guest(cppr); + + /* Remember old and update SW state */ + old_cppr = xc->cppr; + xc->cppr = cppr; + + /* + * Order the above update of xc->cppr with the subsequent + * read of xc->mfrr inside push_pending_to_hw() + */ + smp_mb(); + + if (cppr > old_cppr) { + /* + * We are masking less, we need to look for pending things + * to deliver and set VP pending bits accordingly to trigger + * a new interrupt otherwise we might miss MFRR changes for + * which we have optimized out sending an IPI signal. + */ + xive_vm_push_pending_to_hw(xc); + } else { + /* + * We are masking more, we need to check the queue for any + * interrupt that has been routed to another CPU, take + * it out (replace it with the dummy) and retrigger it. + * + * This is necessary since those interrupts may otherwise + * never be processed, at least not until this CPU restores + * its CPPR. + * + * This is in theory racy vs. HW adding new interrupts to + * the queue. In practice this works because the interesting + * cases are when the guest has done a set_xive() to move the + * interrupt away, which flushes the xive, followed by the + * target CPU doing a H_CPPR. So any new interrupt coming into + * the queue must still be routed to us and isn't a source + * of concern. + */ + xive_vm_scan_for_rerouted_irqs(xive, xc); + } + + /* Apply new CPPR */ + xc->hw_cppr = cppr; + __raw_writeb(cppr, xive_tima + TM_QW1_OS + TM_CPPR); + + return H_SUCCESS; +} + +static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xive *xive = vcpu->kvm->arch.xive; + struct kvmppc_xive_src_block *sb; + struct kvmppc_xive_irq_state *state; + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct xive_irq_data *xd; + u8 new_cppr = xirr >> 24; + u32 irq = xirr & 0x00ffffff, hw_num; + u16 src; + int rc = 0; + + pr_devel("H_EOI(xirr=%08lx)\n", xirr); + + xc->stat_vm_h_eoi++; + + xc->cppr = xive_prio_from_guest(new_cppr); + + /* + * IPIs are synthetized from MFRR and thus don't need + * any special EOI handling. The underlying interrupt + * used to signal MFRR changes is EOId when fetched from + * the queue. + */ + if (irq == XICS_IPI || irq == 0) { + /* + * This barrier orders the setting of xc->cppr vs. + * subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); + goto bail; + } + + /* Find interrupt source */ + sb = kvmppc_xive_find_source(xive, irq, &src); + if (!sb) { + pr_devel(" source not found !\n"); + rc = H_PARAMETER; + /* Same as above */ + smp_mb(); + goto bail; + } + state = &sb->irq_state[src]; + kvmppc_xive_select_irq(state, &hw_num, &xd); + + state->in_eoi = true; + + /* + * This barrier orders both setting of in_eoi above vs, + * subsequent test of guest_priority, and the setting + * of xc->cppr vs. 
subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); + +again: + if (state->guest_priority == MASKED) { + arch_spin_lock(&sb->lock); + if (state->guest_priority != MASKED) { + arch_spin_unlock(&sb->lock); + goto again; + } + pr_devel(" EOI on saved P...\n"); + + /* Clear old_p, that will cause unmask to perform an EOI */ + state->old_p = false; + + arch_spin_unlock(&sb->lock); + } else { + pr_devel(" EOI on source...\n"); + + /* Perform EOI on the source */ + xive_vm_source_eoi(hw_num, xd); + + /* If it's an emulated LSI, check level and resend */ + if (state->lsi && state->asserted) + __raw_writeq(0, __x_trig_page(xd)); + + } + + /* + * This barrier orders the above guest_priority check + * and spin_lock/unlock with clearing in_eoi below. + * + * It also has to be a full mb() as it must ensure + * the MMIOs done in source_eoi() are completed before + * state->in_eoi is visible. + */ + mb(); + state->in_eoi = false; +bail: + + /* Re-evaluate pending IRQs and update HW */ + xive_vm_scan_interrupts(xc, xc->pending, scan_eoi); + xive_vm_push_pending_to_hw(xc); + pr_devel(" after scan pending=%02x\n", xc->pending); + + /* Apply new CPPR */ + xc->hw_cppr = xc->cppr; + __raw_writeb(xc->cppr, xive_tima + TM_QW1_OS + TM_CPPR); + + return rc; +} + +static int xive_vm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); + + xc->stat_vm_h_ipi++; + + /* Find target */ + vcpu = kvmppc_xive_find_server(vcpu->kvm, server); + if (!vcpu) + return H_PARAMETER; + xc = vcpu->arch.xive_vcpu; + + /* Locklessly write over MFRR */ + xc->mfrr = mfrr; + + /* + * The load of xc->cppr below and the subsequent MMIO store + * to the IPI must happen after the above mfrr update is + * globally visible so that: + * + * - Synchronize with another CPU doing an H_EOI or a H_CPPR + * updating xc->cppr then reading xc->mfrr. + * + * - The target of the IPI sees the xc->mfrr update + */ + mb(); + + /* Shoot the IPI if most favored than target cppr */ + if (mfrr < xc->cppr) + __raw_writeq(0, __x_trig_page(&xc->vp_ipi_data)); + + return H_SUCCESS; +} /* * We leave a gap of a couple of interrupts in the queue to @@ -179,12 +781,13 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu); -void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) +bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr; + bool ret = true; if (!esc_vaddr) - return; + return ret; /* we are using XIVE with single escalation */ @@ -197,7 +800,7 @@ void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) * we also don't want to set xive_esc_on to 1 here in * case we race with xive_esc_irq(). */ - vcpu->arch.ceded = 0; + ret = false; /* * The escalation interrupts are special as we don't EOI them. 
* There is no need to use the load-after-store ordering offset @@ -210,6 +813,8 @@ void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) __raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00); } mb(); + + return ret; } EXPORT_SYMBOL_GPL(kvmppc_xive_rearm_escalation); @@ -238,7 +843,7 @@ static irqreturn_t xive_esc_irq(int irq, void *data) vcpu->arch.irq_pending = 1; smp_mb(); - if (vcpu->arch.ceded) + if (vcpu->arch.ceded || vcpu->arch.nested) kvmppc_fast_vcpu_kick(vcpu); /* Since we have the no-EOI flag, the interrupt is effectively diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 09d0657596c3..1e48f72e8aa5 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -285,13 +285,6 @@ static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) return cur & 0x7fffffff; } -extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu); -extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server); -extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr); -extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); -extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); - /* * Common Xive routines for XICS-over-XIVE and XIVE native */ diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c deleted file mode 100644 index b0015e05d99a..000000000000 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ /dev/null @@ -1,636 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation - */ - -/* File to be included by other .c files */ - -#define XGLUE(a,b) a##b -#define GLUE(a,b) XGLUE(a,b) - -/* Dummy interrupt used when taking interrupts out of a queue in H_CPPR */ -#define XICS_DUMMY 1 - -static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) -{ - u8 cppr; - u16 ack; - - /* - * Ensure any previous store to CPPR is ordered vs. - * the subsequent loads from PIPR or ACK. - */ - eieio(); - - /* Perform the acknowledge OS to register cycle. */ - ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); - - /* Synchronize subsequent queue accesses */ - mb(); - - /* XXX Check grouping level */ - - /* Anything ? */ - if (!((ack >> 8) & TM_QW1_NSR_EO)) - return; - - /* Grab CPPR of the most favored pending interrupt */ - cppr = ack & 0xff; - if (cppr < 8) - xc->pending |= 1 << cppr; - -#ifdef XIVE_RUNTIME_CHECKS - /* Check consistency */ - if (cppr >= xc->hw_cppr) - pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n", - smp_processor_id(), cppr, xc->hw_cppr); -#endif - - /* - * Update our image of the HW CPPR. We don't yet modify - * xc->cppr, this will be done as we scan for interrupts - * in the queues. - */ - xc->hw_cppr = cppr; -} - -static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset) -{ - u64 val; - - if (offset == XIVE_ESB_SET_PQ_10 && xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - offset |= XIVE_ESB_LD_ST_MO; - - val =__x_readq(__x_eoi_page(xd) + offset); -#ifdef __LITTLE_ENDIAN__ - val >>= 64-8; -#endif - return (u8)val; -} - - -static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd) -{ - /* If the XIVE supports the new "store EOI facility, use it */ - if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) - __x_writeq(0, __x_eoi_page(xd) + XIVE_ESB_STORE_EOI); - else if (xd->flags & XIVE_IRQ_FLAG_LSI) { - /* - * For LSIs the HW EOI cycle is used rather than PQ bits, - * as they are automatically re-triggred in HW when still - * pending. 
- */ - __x_readq(__x_eoi_page(xd) + XIVE_ESB_LOAD_EOI); - } else { - uint64_t eoi_val; - - /* - * Otherwise for EOI, we use the special MMIO that does - * a clear of both P and Q and returns the old Q, - * except for LSIs where we use the "EOI cycle" special - * load. - * - * This allows us to then do a re-trigger if Q was set - * rather than synthetizing an interrupt in software - */ - eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00); - - /* Re-trigger if needed */ - if ((eoi_val & 1) && __x_trig_page(xd)) - __x_writeq(0, __x_trig_page(xd)); - } -} - -enum { - scan_fetch, - scan_poll, - scan_eoi, -}; - -static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc, - u8 pending, int scan_type) -{ - u32 hirq = 0; - u8 prio = 0xff; - - /* Find highest pending priority */ - while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) { - struct xive_q *q; - u32 idx, toggle; - __be32 *qpage; - - /* - * If pending is 0 this will return 0xff which is what - * we want - */ - prio = ffs(pending) - 1; - - /* Don't scan past the guest cppr */ - if (prio >= xc->cppr || prio > 7) { - if (xc->mfrr < xc->cppr) { - prio = xc->mfrr; - hirq = XICS_IPI; - } - break; - } - - /* Grab queue and pointers */ - q = &xc->queues[prio]; - idx = q->idx; - toggle = q->toggle; - - /* - * Snapshot the queue page. The test further down for EOI - * must use the same "copy" that was used by __xive_read_eq - * since qpage can be set concurrently and we don't want - * to miss an EOI. - */ - qpage = READ_ONCE(q->qpage); - -skip_ipi: - /* - * Try to fetch from the queue. Will return 0 for a - * non-queueing priority (ie, qpage = 0). - */ - hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle); - - /* - * If this was a signal for an MFFR change done by - * H_IPI we skip it. Additionally, if we were fetching - * we EOI it now, thus re-enabling reception of a new - * such signal. - * - * We also need to do that if prio is 0 and we had no - * page for the queue. In this case, we have non-queued - * IPI that needs to be EOId. - * - * This is safe because if we have another pending MFRR - * change that wasn't observed above, the Q bit will have - * been set and another occurrence of the IPI will trigger. - */ - if (hirq == XICS_IPI || (prio == 0 && !qpage)) { - if (scan_type == scan_fetch) { - GLUE(X_PFX,source_eoi)(xc->vp_ipi, - &xc->vp_ipi_data); - q->idx = idx; - q->toggle = toggle; - } - /* Loop back on same queue with updated idx/toggle */ -#ifdef XIVE_RUNTIME_CHECKS - WARN_ON(hirq && hirq != XICS_IPI); -#endif - if (hirq) - goto skip_ipi; - } - - /* If it's the dummy interrupt, continue searching */ - if (hirq == XICS_DUMMY) - goto skip_ipi; - - /* Clear the pending bit if the queue is now empty */ - if (!hirq) { - pending &= ~(1 << prio); - - /* - * Check if the queue count needs adjusting due to - * interrupts being moved away. - */ - if (atomic_read(&q->pending_count)) { - int p = atomic_xchg(&q->pending_count, 0); - if (p) { -#ifdef XIVE_RUNTIME_CHECKS - WARN_ON(p > atomic_read(&q->count)); -#endif - atomic_sub(p, &q->count); - } - } - } - - /* - * If the most favoured prio we found pending is less - * favored (or equal) than a pending IPI, we return - * the IPI instead. 
- */ - if (prio >= xc->mfrr && xc->mfrr < xc->cppr) { - prio = xc->mfrr; - hirq = XICS_IPI; - break; - } - - /* If fetching, update queue pointers */ - if (scan_type == scan_fetch) { - q->idx = idx; - q->toggle = toggle; - } - } - - /* If we are just taking a "peek", do nothing else */ - if (scan_type == scan_poll) - return hirq; - - /* Update the pending bits */ - xc->pending = pending; - - /* - * If this is an EOI that's it, no CPPR adjustment done here, - * all we needed was cleanup the stale pending bits and check - * if there's anything left. - */ - if (scan_type == scan_eoi) - return hirq; - - /* - * If we found an interrupt, adjust what the guest CPPR should - * be as if we had just fetched that interrupt from HW. - * - * Note: This can only make xc->cppr smaller as the previous - * loop will only exit with hirq != 0 if prio is lower than - * the current xc->cppr. Thus we don't need to re-check xc->mfrr - * for pending IPIs. - */ - if (hirq) - xc->cppr = prio; - /* - * If it was an IPI the HW CPPR might have been lowered too much - * as the HW interrupt we use for IPIs is routed to priority 0. - * - * We re-sync it here. - */ - if (xc->cppr != xc->hw_cppr) { - xc->hw_cppr = xc->cppr; - __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); - } - - return hirq; -} - -X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - u8 old_cppr; - u32 hirq; - - pr_devel("H_XIRR\n"); - - xc->GLUE(X_STAT_PFX,h_xirr)++; - - /* First collect pending bits from HW */ - GLUE(X_PFX,ack_pending)(xc); - - pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", - xc->pending, xc->hw_cppr, xc->cppr); - - /* Grab previous CPPR and reverse map it */ - old_cppr = xive_prio_to_guest(xc->cppr); - - /* Scan for actual interrupts */ - hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch); - - pr_devel(" got hirq=0x%x hw_cppr=%d cppr=%d\n", - hirq, xc->hw_cppr, xc->cppr); - -#ifdef XIVE_RUNTIME_CHECKS - /* That should never hit */ - if (hirq & 0xff000000) - pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq); -#endif - - /* - * XXX We could check if the interrupt is masked here and - * filter it. 
If we chose to do so, we would need to do: - * - * if (masked) { - * lock(); - * if (masked) { - * old_Q = true; - * hirq = 0; - * } - * unlock(); - * } - */ - - /* Return interrupt and old CPPR in GPR4 */ - vcpu->arch.regs.gpr[4] = hirq | (old_cppr << 24); - - return H_SUCCESS; -} - -X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - u8 pending = xc->pending; - u32 hirq; - - pr_devel("H_IPOLL(server=%ld)\n", server); - - xc->GLUE(X_STAT_PFX,h_ipoll)++; - - /* Grab the target VCPU if not the current one */ - if (xc->server_num != server) { - vcpu = kvmppc_xive_find_server(vcpu->kvm, server); - if (!vcpu) - return H_PARAMETER; - xc = vcpu->arch.xive_vcpu; - - /* Scan all priorities */ - pending = 0xff; - } else { - /* Grab pending interrupt if any */ - __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); - u8 pipr = be64_to_cpu(qw1) & 0xff; - if (pipr < 8) - pending |= 1 << pipr; - } - - hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll); - - /* Return interrupt and old CPPR in GPR4 */ - vcpu->arch.regs.gpr[4] = hirq | (xc->cppr << 24); - - return H_SUCCESS; -} - -static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc) -{ - u8 pending, prio; - - pending = xc->pending; - if (xc->mfrr != 0xff) { - if (xc->mfrr < 8) - pending |= 1 << xc->mfrr; - else - pending |= 0x80; - } - if (!pending) - return; - prio = ffs(pending) - 1; - - __x_writeb(prio, __x_tima + TM_SPC_SET_OS_PENDING); -} - -static void GLUE(X_PFX,scan_for_rerouted_irqs)(struct kvmppc_xive *xive, - struct kvmppc_xive_vcpu *xc) -{ - unsigned int prio; - - /* For each priority that is now masked */ - for (prio = xc->cppr; prio < KVMPPC_XIVE_Q_COUNT; prio++) { - struct xive_q *q = &xc->queues[prio]; - struct kvmppc_xive_irq_state *state; - struct kvmppc_xive_src_block *sb; - u32 idx, toggle, entry, irq, hw_num; - struct xive_irq_data *xd; - __be32 *qpage; - u16 src; - - idx = q->idx; - toggle = q->toggle; - qpage = READ_ONCE(q->qpage); - if (!qpage) - continue; - - /* For each interrupt in the queue */ - for (;;) { - entry = be32_to_cpup(qpage + idx); - - /* No more ? */ - if ((entry >> 31) == toggle) - break; - irq = entry & 0x7fffffff; - - /* Skip dummies and IPIs */ - if (irq == XICS_DUMMY || irq == XICS_IPI) - goto next; - sb = kvmppc_xive_find_source(xive, irq, &src); - if (!sb) - goto next; - state = &sb->irq_state[src]; - - /* Has it been rerouted ? */ - if (xc->server_num == state->act_server) - goto next; - - /* - * Allright, it *has* been re-routed, kill it from - * the queue. 
- */ - qpage[idx] = cpu_to_be32((entry & 0x80000000) | XICS_DUMMY); - - /* Find the HW interrupt */ - kvmppc_xive_select_irq(state, &hw_num, &xd); - - /* If it's not an LSI, set PQ to 11 the EOI will force a resend */ - if (!(xd->flags & XIVE_IRQ_FLAG_LSI)) - GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_11); - - /* EOI the source */ - GLUE(X_PFX,source_eoi)(hw_num, xd); - - next: - idx = (idx + 1) & q->msk; - if (idx == 0) - toggle ^= 1; - } - } -} - -X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct kvmppc_xive *xive = vcpu->kvm->arch.xive; - u8 old_cppr; - - pr_devel("H_CPPR(cppr=%ld)\n", cppr); - - xc->GLUE(X_STAT_PFX,h_cppr)++; - - /* Map CPPR */ - cppr = xive_prio_from_guest(cppr); - - /* Remember old and update SW state */ - old_cppr = xc->cppr; - xc->cppr = cppr; - - /* - * Order the above update of xc->cppr with the subsequent - * read of xc->mfrr inside push_pending_to_hw() - */ - smp_mb(); - - if (cppr > old_cppr) { - /* - * We are masking less, we need to look for pending things - * to deliver and set VP pending bits accordingly to trigger - * a new interrupt otherwise we might miss MFRR changes for - * which we have optimized out sending an IPI signal. - */ - GLUE(X_PFX,push_pending_to_hw)(xc); - } else { - /* - * We are masking more, we need to check the queue for any - * interrupt that has been routed to another CPU, take - * it out (replace it with the dummy) and retrigger it. - * - * This is necessary since those interrupts may otherwise - * never be processed, at least not until this CPU restores - * its CPPR. - * - * This is in theory racy vs. HW adding new interrupts to - * the queue. In practice this works because the interesting - * cases are when the guest has done a set_xive() to move the - * interrupt away, which flushes the xive, followed by the - * target CPU doing a H_CPPR. So any new interrupt coming into - * the queue must still be routed to us and isn't a source - * of concern. - */ - GLUE(X_PFX,scan_for_rerouted_irqs)(xive, xc); - } - - /* Apply new CPPR */ - xc->hw_cppr = cppr; - __x_writeb(cppr, __x_tima + TM_QW1_OS + TM_CPPR); - - return H_SUCCESS; -} - -X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) -{ - struct kvmppc_xive *xive = vcpu->kvm->arch.xive; - struct kvmppc_xive_src_block *sb; - struct kvmppc_xive_irq_state *state; - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - struct xive_irq_data *xd; - u8 new_cppr = xirr >> 24; - u32 irq = xirr & 0x00ffffff, hw_num; - u16 src; - int rc = 0; - - pr_devel("H_EOI(xirr=%08lx)\n", xirr); - - xc->GLUE(X_STAT_PFX,h_eoi)++; - - xc->cppr = xive_prio_from_guest(new_cppr); - - /* - * IPIs are synthetized from MFRR and thus don't need - * any special EOI handling. The underlying interrupt - * used to signal MFRR changes is EOId when fetched from - * the queue. - */ - if (irq == XICS_IPI || irq == 0) { - /* - * This barrier orders the setting of xc->cppr vs. 
- * subsquent test of xc->mfrr done inside - * scan_interrupts and push_pending_to_hw - */ - smp_mb(); - goto bail; - } - - /* Find interrupt source */ - sb = kvmppc_xive_find_source(xive, irq, &src); - if (!sb) { - pr_devel(" source not found !\n"); - rc = H_PARAMETER; - /* Same as above */ - smp_mb(); - goto bail; - } - state = &sb->irq_state[src]; - kvmppc_xive_select_irq(state, &hw_num, &xd); - - state->in_eoi = true; - - /* - * This barrier orders both setting of in_eoi above vs, - * subsequent test of guest_priority, and the setting - * of xc->cppr vs. subsquent test of xc->mfrr done inside - * scan_interrupts and push_pending_to_hw - */ - smp_mb(); - -again: - if (state->guest_priority == MASKED) { - arch_spin_lock(&sb->lock); - if (state->guest_priority != MASKED) { - arch_spin_unlock(&sb->lock); - goto again; - } - pr_devel(" EOI on saved P...\n"); - - /* Clear old_p, that will cause unmask to perform an EOI */ - state->old_p = false; - - arch_spin_unlock(&sb->lock); - } else { - pr_devel(" EOI on source...\n"); - - /* Perform EOI on the source */ - GLUE(X_PFX,source_eoi)(hw_num, xd); - - /* If it's an emulated LSI, check level and resend */ - if (state->lsi && state->asserted) - __x_writeq(0, __x_trig_page(xd)); - - } - - /* - * This barrier orders the above guest_priority check - * and spin_lock/unlock with clearing in_eoi below. - * - * It also has to be a full mb() as it must ensure - * the MMIOs done in source_eoi() are completed before - * state->in_eoi is visible. - */ - mb(); - state->in_eoi = false; -bail: - - /* Re-evaluate pending IRQs and update HW */ - GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi); - GLUE(X_PFX,push_pending_to_hw)(xc); - pr_devel(" after scan pending=%02x\n", xc->pending); - - /* Apply new CPPR */ - xc->hw_cppr = xc->cppr; - __x_writeb(xc->cppr, __x_tima + TM_QW1_OS + TM_CPPR); - - return rc; -} - -X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, - unsigned long mfrr) -{ - struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; - - pr_devel("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr); - - xc->GLUE(X_STAT_PFX,h_ipi)++; - - /* Find target */ - vcpu = kvmppc_xive_find_server(vcpu->kvm, server); - if (!vcpu) - return H_PARAMETER; - xc = vcpu->arch.xive_vcpu; - - /* Locklessly write over MFRR */ - xc->mfrr = mfrr; - - /* - * The load of xc->cppr below and the subsequent MMIO store - * to the IPI must happen after the above mfrr update is - * globally visible so that: - * - * - Synchronize with another CPU doing an H_EOI or a H_CPPR - * updating xc->cppr then reading xc->mfrr. - * - * - The target of the IPI sees the xc->mfrr update - */ - mb(); - - /* Shoot the IPI if most favored than target cppr */ - if (mfrr < xc->cppr) - __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); - - return H_SUCCESS; -} diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 4ff1372e48d4..57e0ad6a2ca3 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -399,7 +399,6 @@ static int __init kvmppc_e500mc_init(void) * allocator. 
*/ kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); - kvmppc_claim_lpid(0); /* host */ r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); if (r) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 81069e3c3570..191992fcb2c2 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2497,41 +2497,37 @@ out: return r; } -static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)]; +static DEFINE_IDA(lpid_inuse); static unsigned long nr_lpids; long kvmppc_alloc_lpid(void) { - long lpid; + int lpid; - do { - lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS); - if (lpid >= nr_lpids) { + /* The host LPID must always be 0 (allocation starts at 1) */ + lpid = ida_alloc_range(&lpid_inuse, 1, nr_lpids - 1, GFP_KERNEL); + if (lpid < 0) { + if (lpid == -ENOMEM) + pr_err("%s: Out of memory\n", __func__); + else pr_err("%s: No LPIDs free\n", __func__); - return -ENOMEM; - } - } while (test_and_set_bit(lpid, lpid_inuse)); + return -ENOMEM; + } return lpid; } EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); -void kvmppc_claim_lpid(long lpid) -{ - set_bit(lpid, lpid_inuse); -} -EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); - void kvmppc_free_lpid(long lpid) { - clear_bit(lpid, lpid_inuse); + ida_free(&lpid_inuse, lpid); } EXPORT_SYMBOL_GPL(kvmppc_free_lpid); +/* nr_lpids_param includes the host LPID */ void kvmppc_init_lpid(unsigned long nr_lpids_param) { - nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); - memset(lpid_inuse, 0, sizeof(lpid_inuse)); + nr_lpids = nr_lpids_param; } EXPORT_SYMBOL_GPL(kvmppc_init_lpid); diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index 38cd0ed0a617..32e2cb5811cc 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -409,9 +409,9 @@ TRACE_EVENT(kvmppc_run_core, ); TRACE_EVENT(kvmppc_vcore_blocked, - TP_PROTO(struct kvmppc_vcore *vc, int where), + TP_PROTO(struct kvm_vcpu *vcpu, int where), - TP_ARGS(vc, where), + TP_ARGS(vcpu, where), TP_STRUCT__entry( __field(int, n_runnable) @@ -421,8 +421,8 @@ TRACE_EVENT(kvmppc_vcore_blocked, ), TP_fast_assign( - __entry->runner_vcpu = vc->runner->vcpu_id; - __entry->n_runnable = vc->n_runnable; + __entry->runner_vcpu = vcpu->vcpu_id; + __entry->n_runnable = vcpu->arch.vcore->n_runnable; __entry->where = where; __entry->tgid = current->tgid; ), |
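Note on the XICS priority handling consolidated into book3s_xive.c above: throughout the new xive_vm_* helpers, an 8-bit "pending" mask carries one bit per priority 0-7 (bit N set means priority N has work queued), and ffs() selects the numerically lowest, i.e. most favoured, pending priority — the pattern behind `prio = ffs(pending) - 1` in xive_vm_scan_interrupts() and xive_vm_push_pending_to_hw(). The sketch below is a standalone userspace approximation for illustration only; the concrete values (priorities 3 and 6, MFRR 5) are invented, and it is not part of the patch.

/*
 * Standalone illustration (userspace, not kernel code) of the
 * pending-priority bookkeeping used above: bit N of the 8-bit mask
 * means "priority N is pending", and ffs() picks the most favoured
 * (lowest-numbered) pending priority.
 */
#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	uint8_t pending = 0;
	uint8_t mfrr = 5;	/* invented MFRR value for the example */

	/* Mark priorities 3 and 6 as pending, as the event queues would */
	pending |= 1 << 3;
	pending |= 1 << 6;

	/* Fold the IPI priority into the mask, as push_pending_to_hw() does */
	if (mfrr != 0xff && mfrr < 8)
		pending |= 1 << mfrr;

	if (pending)
		printf("most favoured pending priority: %d\n",
		       ffs(pending) - 1);	/* prints 3 */

	/* Clear a priority once its queue is found empty, then rescan */
	pending &= ~(1 << 3);
	printf("next most favoured: %d\n", ffs(pending) - 1);	/* prints 5 */

	return 0;
}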
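Note on the powerpc.c hunk above: the patch replaces the open-coded lpid_inuse bitmap with an IDA that hands out LPIDs starting at 1, keeping LPID 0 reserved for the host, and returns freed LPIDs to the pool. The sketch below illustrates that allocation policy only; it is a userspace approximation in which a plain array and the invented helpers alloc_lpid()/free_lpid() stand in for the kernel's ida_alloc_range()/ida_free(), not kernel code.

/*
 * Standalone illustration (userspace, not kernel code) of the LPID
 * allocation policy the patch moves to: IDs are allocated lowest-free
 * starting at 1 because LPID 0 always belongs to the host, and freed
 * IDs are reusable.
 */
#include <stdio.h>

#define NR_LPIDS 16				/* illustrative size only */

static unsigned char lpid_inuse[NR_LPIDS];	/* nonzero = allocated */

static long alloc_lpid(void)
{
	/* Start at 1: LPID 0 is reserved for the host */
	for (long lpid = 1; lpid < NR_LPIDS; lpid++) {
		if (!lpid_inuse[lpid]) {
			lpid_inuse[lpid] = 1;
			return lpid;
		}
	}
	return -1;				/* no LPIDs free */
}

static void free_lpid(long lpid)
{
	lpid_inuse[lpid] = 0;
}

int main(void)
{
	long a = alloc_lpid();			/* 1 */
	long b = alloc_lpid();			/* 2 */

	free_lpid(a);
	printf("reused: %ld\n", alloc_lpid());	/* 1 again */
	printf("next:   %ld\n", alloc_lpid());	/* 3 */
	(void)b;

	return 0;
}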