diff options
Diffstat (limited to 'arch/powerpc')
49 files changed, 1159 insertions, 196 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e4824fd04bb7..9faa18c4f3f7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -557,7 +557,7 @@ choice config PPC_4K_PAGES bool "4k page size" - select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S + select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_16K_PAGES bool "16k page size" @@ -566,7 +566,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64) - select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S + select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES bool "256k page size" diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 06f17e778c27..8d1c8162f0c1 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -50,7 +50,9 @@ * set of bits not changed in pmd_modify. */ #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ - _PAGE_ACCESSED | _PAGE_THP_HUGE) + _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \ + _PAGE_SOFT_DIRTY) + #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/hash-64k.h> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8204b0c393aa..ac07a30a7934 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -223,7 +223,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_young(pmd) pte_young(pmd_pte(pmd)) -#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) @@ -282,6 +281,10 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE +extern void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index c5eb86f3d452..867c39b45df6 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -81,6 +81,7 @@ struct pci_dn; #define EEH_PE_KEEP (1 << 8) /* Keep PE on hotplug */ #define EEH_PE_CFG_RESTRICTED (1 << 9) /* Block config on error */ #define EEH_PE_REMOVED (1 << 10) /* Removed permanently */ +#define EEH_PE_PRI_BUS (1 << 11) /* Cached primary bus */ struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 2aa79c864e91..7529aab068f5 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) } #endif -#define SPAPR_TCE_SHIFT 12 - #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 271fefbbe521..d7b343170453 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -38,8 +38,7 @@ #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS -#define KVM_USER_MEM_SLOTS 32 -#define KVM_MEM_SLOTS_NUM KVM_USER_MEM_SLOTS +#define KVM_USER_MEM_SLOTS 512 #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -183,7 +182,10 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; + struct rcu_head rcu; + u32 page_shift; + u64 offset; /* in pages */ + u64 size; /* window size in pages */ struct page *pages[0]; }; @@ -290,7 +292,7 @@ struct kvmppc_vcore { struct list_head runnable_threads; struct list_head preempt_list; spinlock_t lock; - wait_queue_head_t wq; + struct swait_queue_head wq; spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ u64 stolen_tb; u64 preempt_tb; @@ -630,7 +632,7 @@ struct kvm_vcpu_arch { u8 prodded; u32 last_inst; - wait_queue_head_t *wqp; + struct swait_queue_head *wqp; struct kvmppc_vcore *vcore; int ret; int trap; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2241d5357129..2544edabe7f3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); +extern struct kvmppc_spapr_tce_table *kvmppc_find_table( + struct kvm_vcpu *vcpu, unsigned long liobn); +extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages); +extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, + unsigned long tce); +extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap); +extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt, + unsigned long idx, unsigned long tce); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); +extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba); extern struct page *kvm_alloc_hpt(unsigned long nr_pages); @@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } +extern void kvmppc_alloc_host_rm_ops(void); +extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); +extern void kvmppc_xics_ipi_action(void); +extern int h_ipi_redirect; #else +static inline void kvmppc_alloc_host_rm_ops(void) {}; +static inline void kvmppc_free_host_rm_ops(void) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } @@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } #endif +/* + * Host-side operations we want to set up while running in real + * mode in the guest operating on the xics. + * Currently only VCPU wakeup is supported. + */ + +union kvmppc_rm_state { + unsigned long raw; + struct { + u32 in_host; + u32 rm_action; + }; +}; + +struct kvmppc_host_rm_core { + union kvmppc_rm_state rm_state; + void *rm_data; + char pad[112]; +}; + +struct kvmppc_host_rm_ops { + struct kvmppc_host_rm_core *rm_core; + void (*vcpu_kick)(struct kvm_vcpu *vcpu); +}; + +extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; + static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu) { #ifdef CONFIG_KVM_BOOKE_HV diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index ac9fb114e25d..47897a30982d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, } return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); } + +unsigned long vmalloc_to_phys(void *vmalloc_addr); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 825663c30945..78083ed20792 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu); #define PPC_MSG_TICK_BROADCAST 2 #define PPC_MSG_DEBUGGER_BREAK 3 +/* This is only used by the powernv kernel */ +#define PPC_MSG_RM_HOST_ACTION 4 + /* for irq controllers that have dedicated ipis per message (4) */ extern int smp_request_message_ipi(int virq, int message); extern const char *smp_ipi_name[]; @@ -121,6 +124,7 @@ extern const char *smp_ipi_name[]; /* for irq controllers with only a single ipi */ extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); +extern void smp_muxed_ipi_set_message(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); void smp_init_pSeries(void); diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 5654ece02c0d..3fa9df70aa20 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -383,3 +383,4 @@ SYSCALL(ni_syscall) SYSCALL(ni_syscall) SYSCALL(ni_syscall) SYSCALL(mlock2) +SYSCALL(copy_file_range) diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h index 8e86b48d0369..32e36b16773f 100644 --- a/arch/powerpc/include/asm/trace.h +++ b/arch/powerpc/include/asm/trace.h @@ -57,12 +57,14 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit, extern void hcall_tracepoint_regfunc(void); extern void hcall_tracepoint_unregfunc(void); -TRACE_EVENT_FN(hcall_entry, +TRACE_EVENT_FN_COND(hcall_entry, TP_PROTO(unsigned long opcode, unsigned long *args), TP_ARGS(opcode, args), + TP_CONDITION(cpu_online(raw_smp_processor_id())), + TP_STRUCT__entry( __field(unsigned long, opcode) ), @@ -76,13 +78,15 @@ TRACE_EVENT_FN(hcall_entry, hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc ); -TRACE_EVENT_FN(hcall_exit, +TRACE_EVENT_FN_COND(hcall_exit, TP_PROTO(unsigned long opcode, unsigned long retval, unsigned long *retbuf), TP_ARGS(opcode, retval, retbuf), + TP_CONDITION(cpu_online(raw_smp_processor_id())), + TP_STRUCT__entry( __field(unsigned long, opcode) __field(unsigned long, retval) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 6a5ace5fa0c8..1f2594d45605 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -12,7 +12,7 @@ #include <uapi/asm/unistd.h> -#define NR_syscalls 379 +#define NR_syscalls 380 #define __NR__exit __NR_exit diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index 0e25bdb190bb..254604856e69 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -30,6 +30,7 @@ #ifdef CONFIG_PPC_ICP_NATIVE extern int icp_native_init(void); extern void icp_native_flush_interrupt(void); +extern void icp_native_cause_ipi_rm(int cpu); #else static inline int icp_native_init(void) { return -ENODEV; } #endif diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index ab4d4732c492..c93cf35ce379 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u32 page_shift; + __u32 flags; + __u64 offset; /* in pages */ + __u64 size; /* in pages */ +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h index 12a05652377a..940290d45b08 100644 --- a/arch/powerpc/include/uapi/asm/unistd.h +++ b/arch/powerpc/include/uapi/asm/unistd.h @@ -389,5 +389,6 @@ #define __NR_userfaultfd 364 #define __NR_membarrier 365 #define __NR_mlock2 378 +#define __NR_copy_file_range 379 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */ diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 938742135ee0..650cfb31ea3d 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -418,8 +418,7 @@ static void *eeh_rmv_device(void *data, void *userdata) eeh_pcid_put(dev); if (driver->err_handler && driver->err_handler->error_detected && - driver->err_handler->slot_reset && - driver->err_handler->resume) + driver->err_handler->slot_reset) return NULL; } @@ -564,6 +563,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) */ eeh_pe_state_mark(pe, EEH_PE_KEEP); if (bus) { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); pci_lock_rescan_remove(); pcibios_remove_pci_devices(bus); pci_unlock_rescan_remove(); @@ -803,6 +803,7 @@ perm_error: * the their PCI config any more. */ if (frozen_bus) { + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); pci_lock_rescan_remove(); @@ -886,6 +887,7 @@ static void eeh_handle_special_event(void) continue; /* Notify all devices to be down */ + eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); bus = eeh_pe_bus_get(phb_pe); eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 8654cb166c19..98f81800e00c 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -883,32 +883,29 @@ void eeh_pe_restore_bars(struct eeh_pe *pe) const char *eeh_pe_loc_get(struct eeh_pe *pe) { struct pci_bus *bus = eeh_pe_bus_get(pe); - struct device_node *dn = pci_bus_to_OF_node(bus); + struct device_node *dn; const char *loc = NULL; - if (!dn) - goto out; + while (bus) { + dn = pci_bus_to_OF_node(bus); + if (!dn) { + bus = bus->parent; + continue; + } - /* PHB PE or root PE ? */ - if (pci_is_root_bus(bus)) { - loc = of_get_property(dn, "ibm,loc-code", NULL); - if (!loc) + if (pci_is_root_bus(bus)) loc = of_get_property(dn, "ibm,io-base-loc-code", NULL); + else + loc = of_get_property(dn, "ibm,slot-location-code", + NULL); + if (loc) - goto out; + return loc; - /* Check the root port */ - dn = dn->child; - if (!dn) - goto out; + bus = bus->parent; } - loc = of_get_property(dn, "ibm,loc-code", NULL); - if (!loc) - loc = of_get_property(dn, "ibm,slot-location-code", NULL); - -out: - return loc ? loc : "N/A"; + return "N/A"; } /** @@ -931,7 +928,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) bus = pe->phb->bus; } else if (pe->type & EEH_PE_BUS || pe->type & EEH_PE_DEVICE) { - if (pe->bus) { + if (pe->state & EEH_PE_PRI_BUS) { bus = pe->bus; goto out; } diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 05e804cdecaa..aec9a1b1d25b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -109,8 +109,9 @@ void arch_unregister_hw_breakpoint(struct perf_event *bp) * If the breakpoint is unregistered between a hw_breakpoint_handler() * and the single_step_dabr_instruction(), then cleanup the breakpoint * restoration variables to prevent dangling pointers. + * FIXME, this should not be using bp->ctx at all! Sayeth peterz. */ - if (bp->ctx && bp->ctx->task) + if (bp->ctx && bp->ctx->task && bp->ctx->task != ((void *)-1L)) bp->ctx->task->thread.last_hit_ubp = NULL; } diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index db475d41b57a..f28754c497e5 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -701,31 +701,3 @@ _GLOBAL(kexec_sequence) li r5,0 blr /* image->start(physid, image->start, 0); */ #endif /* CONFIG_KEXEC */ - -#ifdef CONFIG_MODULES -#if defined(_CALL_ELF) && _CALL_ELF == 2 - -#ifdef CONFIG_MODVERSIONS -.weak __crc_TOC. -.section "___kcrctab+TOC.","a" -.globl __kcrctab_TOC. -__kcrctab_TOC.: - .llong __crc_TOC. -#endif - -/* - * Export a fake .TOC. since both modpost and depmod will complain otherwise. - * Both modpost and depmod strip the leading . so we do the same here. - */ -.section "__ksymtab_strings","a" -__kstrtab_TOC.: - .asciz "TOC." - -.section "___ksymtab+TOC.","a" -/* This symbol name is important: it's used by modpost to find exported syms */ -.globl __ksymtab_TOC. -__ksymtab_TOC.: - .llong 0 /* .value */ - .llong __kstrtab_TOC. -#endif /* ELFv2 */ -#endif /* MODULES */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 59663af9315f..08b7a40de5f8 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -326,7 +326,10 @@ static void dedotify_versions(struct modversion_info *vers, } } -/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */ +/* + * Undefined symbols which refer to .funcname, hack to funcname. Make .TOC. + * seem to be defined (value set later). + */ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) { unsigned int i; @@ -334,8 +337,11 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) for (i = 1; i < numsyms; i++) { if (syms[i].st_shndx == SHN_UNDEF) { char *name = strtab + syms[i].st_name; - if (name[0] == '.') - memmove(name, name+1, strlen(name)); + if (name[0] == '.') { + if (strcmp(name+1, "TOC.") == 0) + syms[i].st_shndx = SHN_ABS; + syms[i].st_name++; + } } } } @@ -351,7 +357,7 @@ static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs, numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym); for (i = 1; i < numsyms; i++) { - if (syms[i].st_shndx == SHN_UNDEF + if (syms[i].st_shndx == SHN_ABS && strcmp(strtab + syms[i].st_name, "TOC.") == 0) return &syms[i]; } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index dccc87e8fee5..3c5736e52a14 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1768,9 +1768,9 @@ static inline unsigned long brk_rnd(void) /* 8MB for 32bit, 1GB for 64bit */ if (is_32bit_task()) - rnd = (long)(get_random_int() % (1<<(23-PAGE_SHIFT))); + rnd = (get_random_long() % (1UL<<(23-PAGE_SHIFT))); else - rnd = (long)(get_random_int() % (1<<(30-PAGE_SHIFT))); + rnd = (get_random_long() % (1UL<<(30-PAGE_SHIFT))); return rnd << PAGE_SHIFT; } diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index ad8c9db61237..d544fa311757 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -114,8 +114,6 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { - lockdep_init(); - /* Enable early debugging if any specified (see udbg.h) */ udbg_early_init(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 5c03a6a9b054..f98be8383a39 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -255,9 +255,6 @@ void __init early_setup(unsigned long dt_ptr) setup_paca(&boot_paca); fixup_boot_paca(); - /* Initialize lockdep early or else spinlocks will blow */ - lockdep_init(); - /* -------- printk is now safe to use ------- */ /* Enable early debugging if any specified (see udbg.h) */ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ec9ec2058d2d..b7dea05f0725 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg) #ifdef CONFIG_PPC_SMP_MUXED_IPI struct cpu_messages { - int messages; /* current messages */ + long messages; /* current messages */ unsigned long data; /* data for cause ipi */ }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); @@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data) info->data = data; } -void smp_muxed_ipi_message_pass(int cpu, int msg) +void smp_muxed_ipi_set_message(int cpu, int msg) { struct cpu_messages *info = &per_cpu(ipi_message, cpu); char *message = (char *)&info->messages; @@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) */ smp_mb(); message[msg] = 1; +} + +void smp_muxed_ipi_message_pass(int cpu, int msg) +{ + struct cpu_messages *info = &per_cpu(ipi_message, cpu); + + smp_muxed_ipi_set_message(cpu, msg); /* * cause_ipi functions are required to include a full barrier * before doing whatever causes the IPI. @@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) } #ifdef __BIG_ENDIAN__ -#define IPI_MESSAGE(A) (1 << (24 - 8 * (A))) +#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A))) #else -#define IPI_MESSAGE(A) (1 << (8 * (A))) +#define IPI_MESSAGE(A) (1uL << (8 * (A))) #endif irqreturn_t smp_ipi_demux(void) { struct cpu_messages *info = this_cpu_ptr(&ipi_message); - unsigned int all; + unsigned long all; mb(); /* order any irq clear */ do { all = xchg(&info->messages, 0); +#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) + /* + * Must check for PPC_MSG_RM_HOST_ACTION messages + * before PPC_MSG_CALL_FUNCTION messages because when + * a VM is destroyed, we call kick_all_cpus_sync() + * to ensure that any pending PPC_MSG_RM_HOST_ACTION + * messages have completed before we free any VCPUs. + */ + if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION)) + kvmppc_xics_ipi_action(); +#endif if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION)) generic_smp_call_function_interrupt(); if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE)) @@ -727,7 +745,7 @@ void start_secondary(void *unused) local_irq_enable(); - cpu_startup_entry(CPUHP_ONLINE); + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); BUG(); } diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 0570eef83fba..7f7b6d86ac73 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o + $(KVM)/eventfd.o $(KVM)/vfio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I. diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 638c6d9be9e0..b34220d2aa42 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) { #ifdef CONFIG_PPC64 - INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables); INIT_LIST_HEAD(&kvm->arch.rtas_tokens); #endif diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 774a253ca4e1..9bf7031a67ff 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -377,15 +377,12 @@ no_seg_found: static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) { - struct kvmppc_vcpu_book3s *vcpu_book3s; u64 esid, esid_1t; int slb_nr; struct kvmppc_slb *slbe; dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb); - vcpu_book3s = to_book3s(vcpu); - esid = GET_ESID(rb); esid_1t = GET_ESID_1T(rb); slb_nr = rb & 0xfff; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc94dad..2c2d1030843a 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com> */ #include <linux/types.h> @@ -36,28 +37,69 @@ #include <asm/ppc-opcode.h> #include <asm/kvm_host.h> #include <asm/udbg.h> +#include <asm/iommu.h> +#include <asm/tce.h> -#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) +static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) +{ + return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; +} -static long kvmppc_stt_npages(unsigned long window_size) +static unsigned long kvmppc_stt_pages(unsigned long tce_pages) { - return ALIGN((window_size >> SPAPR_TCE_SHIFT) - * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; + unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) + + (tce_pages * sizeof(struct page *)); + + return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; } -static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt) +static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) { - struct kvm *kvm = stt->kvm; - int i; + long ret = 0; - mutex_lock(&kvm->lock); - list_del(&stt->list); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + if (!current || !current->mm) + return ret; /* process exited */ + + down_write(¤t->mm->mmap_sem); + + if (inc) { + unsigned long locked, lock_limit; + + locked = current->mm->locked_vm + stt_pages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += stt_pages; + } else { + if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) + stt_pages = current->mm->locked_vm; + + current->mm->locked_vm -= stt_pages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, + inc ? '+' : '-', + stt_pages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void release_spapr_tce_table(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_table *stt = container_of(head, + struct kvmppc_spapr_tce_table, rcu); + unsigned long i, npages = kvmppc_tce_pages(stt->size); + + for (i = 0; i < npages; i++) __free_page(stt->pages[i]); - kfree(stt); - mutex_unlock(&kvm->lock); - kvm_put_kvm(kvm); + kfree(stt); } static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data; struct page *page; - if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size)) + if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; page = stt->pages[vmf->pgoff]; @@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; - release_spapr_tce_table(stt); + list_del_rcu(&stt->list); + + kvm_put_kvm(stt->kvm); + + kvmppc_account_memlimit( + kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); + call_rcu(&stt->rcu, release_spapr_tce_table); + return 0; } @@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = { }; long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args) + struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + unsigned long npages, size; int ret = -ENOMEM; int i; + if (!args->size) + return -EINVAL; + /* Check this LIOBN hasn't been previously allocated */ list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { if (stt->liobn == args->liobn) return -EBUSY; } - npages = kvmppc_stt_npages(args->window_size); + size = args->size; + npages = kvmppc_tce_pages(size); + ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); + if (ret) { + stt = NULL; + goto fail; + } stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); @@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; stt->liobn = args->liobn; - stt->window_size = args->window_size; + stt->page_shift = args->page_shift; + stt->offset = args->offset; + stt->size = size; stt->kvm = kvm; for (i = 0; i < npages; i++) { @@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kvm_get_kvm(kvm); mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); mutex_unlock(&kvm->lock); @@ -148,3 +208,59 @@ fail: } return ret; } + +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS, idx; + unsigned long entry, ua = 0; + u64 __user *tces, tce; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> stt->page_shift; + /* + * SPAPR spec says that the maximum size of the list is 512 TCEs + * so the whole table fits in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tces = (u64 __user *) ua; + + for (i = 0; i < npages; ++i) { + if (get_user(tce, tces + i)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + tce = be64_to_cpu(tce); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; + + kvmppc_tce_put(stt, entry + i, tce); + } + +unlock_exit: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 89e96b3e0039..44be73e6aa26 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -14,6 +14,7 @@ * * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com> + * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com> */ #include <linux/types.h> @@ -30,76 +31,321 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> #include <asm/mmu-hash64.h> +#include <asm/mmu_context.h> #include <asm/hvcall.h> #include <asm/synch.h> #include <asm/ppc-opcode.h> #include <asm/kvm_host.h> #include <asm/udbg.h> +#include <asm/iommu.h> +#include <asm/tce.h> +#include <asm/iommu.h> #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) -/* WARNING: This will be called in real-mode on HV KVM and virtual +/* + * Finds a TCE table descriptor by LIOBN. + * + * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba, unsigned long tce) +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, + unsigned long liobn) { struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; + list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) + if (stt->liobn == liobn) + return stt; + + return NULL; +} +EXPORT_SYMBOL_GPL(kvmppc_find_table); + +/* + * Validates IO address. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages) +{ + unsigned long mask = (1ULL << stt->page_shift) - 1; + unsigned long idx = ioba >> stt->page_shift; + + if ((ioba & mask) || (idx < stt->offset) || + (idx - stt->offset + npages > stt->size) || + (idx + npages < idx)) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); + +/* + * Validates TCE address. + * At the moment flags and page mask are validated. + * As the host kernel does not access those addresses (just puts them + * to the table and user space is supposed to process them), we can skip + * checking other things (such as TCE is a guest RAM address or the page + * was actually allocated). + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) +{ + unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); + unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); + + if (tce & mask) + return H_PARAMETER; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_validate); + +/* Note on the use of page_address() in real mode, + * + * It is safe to use page_address() in real mode on ppc64 because + * page_address() is always defined as lowmem_page_address() + * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic + * operation and does not access page struct. + * + * Theoretically page_address() could be defined different + * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL + * would have to be enabled. + * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64, + * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only + * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP + * is not expected to be enabled on ppc32, page_address() + * is safe for ppc32 as well. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +static u64 *kvmppc_page_address(struct page *page) +{ +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL) +#error TODO: fix to avoid page_address() here +#endif + return (u64 *) page_address(page); +} + +/* + * Handles TCE requests for emulated devices. + * Puts guest TCE values to the table and expects user space to convert them. + * Called in both real and virtual modes. + * Cannot fail so kvmppc_tce_validate must be called before it. + * + * WARNING: This will be called in real-mode on HV KVM and virtual + * mode on PR KVM + */ +void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, + unsigned long idx, unsigned long tce) +{ + struct page *page; + u64 *tbl; + + idx -= stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = kvmppc_page_address(page); + + tbl[idx % TCES_PER_PAGE] = tce; +} +EXPORT_SYMBOL_GPL(kvmppc_tce_put); + +long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, + unsigned long *ua, unsigned long **prmap) +{ + unsigned long gfn = gpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + memslot = search_memslots(kvm_memslots(kvm), gfn); + if (!memslot) + return -EINVAL; + + *ua = __gfn_to_hva_memslot(memslot, gfn) | + (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (prmap) + *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; +#endif + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce) +{ + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; - - /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p window_size=0x%x\n", */ - /* liobn, stt, stt->window_size); */ - if (ioba >= stt->window_size) - return H_PARAMETER; - - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); - - /* FIXME: Need to validate the TCE itself */ - /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */ - tbl[idx % TCES_PER_PAGE] = tce; - return H_SUCCESS; - } - } + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + return ret; + + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); -long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba) +static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, + unsigned long ua, unsigned long *phpa) +{ + pte_t *ptep, pte; + unsigned shift = 0; + + ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift); + if (!ptep || !pte_present(*ptep)) + return -ENXIO; + pte = *ptep; + + if (!shift) + shift = PAGE_SHIFT; + + /* Avoid handling anything potentially complicated in realmode */ + if (shift > PAGE_SHIFT) + return -EAGAIN; + + if (!pte_young(pte)) + return -EAGAIN; + + *phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) | + (ua & ~PAGE_MASK); + + return 0; +} + +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; + long i, ret = H_SUCCESS; + unsigned long tces, entry, ua = 0; + unsigned long *rmap = NULL; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + entry = ioba >> stt->page_shift; + /* + * The spec says that the maximum size of the list is 512 TCEs + * so the whole table addressed resides in 4K page + */ + if (npages > 512) + return H_PARAMETER; + + if (tce_list & (SZ_4K - 1)) + return H_PARAMETER; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == liobn) { - unsigned long idx = ioba >> SPAPR_TCE_SHIFT; - struct page *page; - u64 *tbl; + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; - if (ioba >= stt->window_size) - return H_PARAMETER; + rmap = (void *) vmalloc_to_phys(rmap); - page = stt->pages[idx / TCES_PER_PAGE]; - tbl = (u64 *)page_address(page); + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } + + for (i = 0; i < npages; ++i) { + unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + goto unlock_exit; - vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; - return H_SUCCESS; - } + kvmppc_tce_put(stt, entry + i, tce); } - /* Didn't find the liobn, punt it to userspace */ - return H_TOO_HARD; +unlock_exit: + unlock_rmap(rmap); + + return ret; +} + +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + /* Check permission bits only to allow userspace poison TCE for debug */ + if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) + return H_PARAMETER; + + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); + +long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba) +{ + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + unsigned long idx; + struct page *page; + u64 *tbl; + + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + idx = (ioba >> stt->page_shift) - stt->offset; + page = stt->pages[idx / TCES_PER_PAGE]; + tbl = (u64 *)page_address(page); + + vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE]; + + return H_SUCCESS; } EXPORT_SYMBOL_GPL(kvmppc_h_get_tce); + +#endif /* KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cff207b72c46..84fb4fcfaa41 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -81,6 +81,17 @@ static int target_smt_mode; module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); +#ifdef CONFIG_KVM_XICS +static struct kernel_param_ops module_param_ops = { + .set = param_set_int, + .get = param_get_int, +}; + +module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); +#endif + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -114,11 +125,11 @@ static bool kvmppc_ipi_thread(int cpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { int cpu; - wait_queue_head_t *wqp; + struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (waitqueue_active(wqp)) { - wake_up_interruptible(wqp); + if (swait_active(wqp)) { + swake_up(wqp); ++vcpu->stat.halt_wakeup; } @@ -701,8 +712,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) tvcpu->arch.prodded = 1; smp_mb(); if (vcpu->arch.ceded) { - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (swait_active(&vcpu->wq)) { + swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } } @@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) if (kvmppc_xics_enabled(vcpu)) { ret = kvmppc_xics_hcall(vcpu, req); break; - } /* fallthrough */ + } + return RESUME_HOST; + case H_PUT_TCE: + ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_PUT_TCE_INDIRECT: + ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_STUFF_TCE: + ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; default: return RESUME_HOST; } @@ -833,6 +868,24 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.sum_exits++; + /* + * This can happen if an interrupt occurs in the last stages + * of guest entry or the first stages of guest exit (i.e. after + * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV + * and before setting it to KVM_GUEST_MODE_HOST_HV). + * That can happen due to a bug, or due to a machine check + * occurring at just the wrong time. + */ + if (vcpu->arch.shregs.msr & MSR_HV) { + printk(KERN_EMERG "KVM trap in HV mode!\n"); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + kvmppc_dump_regs(vcpu); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->hw.hardware_exit_reason = vcpu->arch.trap; + return RESUME_HOST; + } run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; switch (vcpu->arch.trap) { @@ -1441,7 +1494,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); - init_waitqueue_head(&vcore->wq); + init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = core * threads_per_subcore; @@ -2261,6 +2314,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) } /* + * Clear core from the list of active host cores as we are about to + * enter the guest. Only do this if it is the primary thread of the + * core (not if a subcore) that is entering the guest. + */ +static inline void kvmppc_clear_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + /* + * Memory barrier can be omitted here as we will do a smp_wmb() + * later in kvmppc_start_thread and we need ensure that state is + * visible to other CPUs only after we enter guest. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; +} + +/* + * Advertise this core as an active host core since we exited the guest + * Only need to do this if it is the primary thread of the core that is + * exiting. + */ +static inline void kvmppc_set_host_core(int cpu) +{ + int core; + + if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) + return; + + /* + * Memory barrier can be omitted here because we do a spin_unlock + * immediately after this which provides the memory barrier. + */ + core = cpu >> threads_shift; + kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; +} + +/* * Run a set of guest threads on a physical core. * Called with vc->lock held. */ @@ -2372,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } } + kvmppc_clear_host_core(pcpu); + /* Start all the threads */ active = 0; for (sub = 0; sub < core_info.n_subcores; ++sub) { @@ -2468,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_ipi_thread(pcpu + i); } + kvmppc_set_host_core(pcpu); + spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ @@ -2513,10 +2610,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu; int do_sleep = 1; + DECLARE_SWAITQUEUE(wait); - DEFINE_WAIT(wait); - - prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); /* * Check one last time for pending exceptions and ceded state after @@ -2530,7 +2626,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } if (!do_sleep) { - finish_wait(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); return; } @@ -2538,7 +2634,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); schedule(); - finish_wait(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); @@ -2594,7 +2690,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - wake_up(&vc->wq); + swake_up(&vc->wq); } } @@ -2966,6 +3062,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto out_srcu; } +#ifdef CONFIG_KVM_XICS +static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + kvmppc_set_host_core(cpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + kvmppc_clear_host_core(cpu); + break; +#endif + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block kvmppc_cpu_notifier = { + .notifier_call = kvmppc_cpu_notify, +}; + +/* + * Allocate a per-core structure for managing state about which cores are + * running in the host versus the guest and for exchanging data between + * real mode KVM and CPU running in the host. + * This is only done for the first VM. + * The allocated structure stays even if all VMs have stopped. + * It is only freed when the kvm-hv module is unloaded. + * It's OK for this routine to fail, we just don't support host + * core operations like redirecting H_IPI wakeups. + */ +void kvmppc_alloc_host_rm_ops(void) +{ + struct kvmppc_host_rm_ops *ops; + unsigned long l_ops; + int cpu, core; + int size; + + /* Not the first time here ? */ + if (kvmppc_host_rm_ops_hv != NULL) + return; + + ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL); + if (!ops) + return; + + size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core); + ops->rm_core = kzalloc(size, GFP_KERNEL); + + if (!ops->rm_core) { + kfree(ops); + return; + } + + get_online_cpus(); + + for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) { + if (!cpu_online(cpu)) + continue; + + core = cpu >> threads_shift; + ops->rm_core[core].rm_state.in_host = 1; + } + + ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv; + + /* + * Make the contents of the kvmppc_host_rm_ops structure visible + * to other CPUs before we assign it to the global variable. + * Do an atomic assignment (no locks used here), but if someone + * beats us to it, just free our copy and return. + */ + smp_wmb(); + l_ops = (unsigned long) ops; + + if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) { + put_online_cpus(); + kfree(ops->rm_core); + kfree(ops); + return; + } + + register_cpu_notifier(&kvmppc_cpu_notifier); + + put_online_cpus(); +} + +void kvmppc_free_host_rm_ops(void) +{ + if (kvmppc_host_rm_ops_hv) { + unregister_cpu_notifier(&kvmppc_cpu_notifier); + kfree(kvmppc_host_rm_ops_hv->rm_core); + kfree(kvmppc_host_rm_ops_hv); + kvmppc_host_rm_ops_hv = NULL; + } +} +#endif + static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; @@ -2978,6 +3182,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) return -ENOMEM; kvm->arch.lpid = lpid; + kvmppc_alloc_host_rm_ops(); + /* * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, @@ -3211,6 +3417,7 @@ static int kvmppc_book3s_init_hv(void) static void kvmppc_book3s_exit_hv(void) { + kvmppc_free_host_rm_ops(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fd7006bf6b1a..5f0380db3eab 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap) kvmhv_interrupt_vcore(vc, ee); } } + +struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; +EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 24f58076d49e..980d8a6f7284 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -17,12 +17,16 @@ #include <asm/xics.h> #include <asm/debug.h> #include <asm/synch.h> +#include <asm/cputhreads.h> #include <asm/ppc-opcode.h> #include "book3s_xics.h" #define DEBUG_PASSUP +int h_ipi_redirect = 1; +EXPORT_SYMBOL(h_ipi_redirect); + static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); @@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics, /* -- ICP routines -- */ +#ifdef CONFIG_SMP +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) +{ + int hcpu; + + hcpu = hcore << threads_shift; + kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; + smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); + icp_native_cause_ipi_rm(hcpu); +} +#else +static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } +#endif + +/* + * We start the search from our current CPU Id in the core map + * and go in a circle until we get back to our ID looking for a + * core that is running in host context and that hasn't already + * been targeted for another rm_host_ops. + * + * In the future, could consider using a fairer algorithm (one + * that distributes the IPIs better) + * + * Returns -1, if no CPU could be found in the host + * Else, returns a CPU Id which has been reserved for use + */ +static inline int grab_next_hostcore(int start, + struct kvmppc_host_rm_core *rm_core, int max, int action) +{ + bool success; + int core; + union kvmppc_rm_state old, new; + + for (core = start + 1; core < max; core++) { + old = new = READ_ONCE(rm_core[core].rm_state); + + if (!old.in_host || old.rm_action) + continue; + + /* Try to grab this host core if not taken already. */ + new.rm_action = action; + + success = cmpxchg64(&rm_core[core].rm_state.raw, + old.raw, new.raw) == old.raw; + if (success) { + /* + * Make sure that the store to the rm_action is made + * visible before we return to caller (and the + * subsequent store to rm_data) to synchronize with + * the IPI handler. + */ + smp_wmb(); + return core; + } + } + + return -1; +} + +static inline int find_available_hostcore(int action) +{ + int core; + int my_core = smp_processor_id() >> threads_shift; + struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core; + + core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action); + if (core == -1) + core = grab_next_hostcore(core, rm_core, my_core, action); + + return core; +} + static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, struct kvm_vcpu *this_vcpu) { struct kvmppc_icp *this_icp = this_vcpu->arch.icp; int cpu; + int hcore; /* Mark the target VCPU as having an interrupt pending */ vcpu->stat.queue_intr++; @@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, return; } - /* Check if the core is loaded, if not, too hard */ + /* + * Check if the core is loaded, + * if not, find an available host core to post to wake the VCPU, + * if we can't find one, set up state to eventually return too hard. + */ cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { - this_icp->rm_action |= XICS_RM_KICK_VCPU; - this_icp->rm_kick_target = vcpu; + hcore = -1; + if (kvmppc_host_rm_ops_hv && h_ipi_redirect) + hcore = find_available_hostcore(XICS_RM_KICK_VCPU); + if (hcore != -1) { + icp_send_hcore_msg(hcore, vcpu); + } else { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + } return; } @@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) bail: return check_too_hard(xics, icp); } + +/* --- Non-real mode XICS-related built-in routines --- */ + +/** + * Host Operations poked by RM KVM + */ +static void rm_host_ipi_action(int action, void *data) +{ + switch (action) { + case XICS_RM_KICK_VCPU: + kvmppc_host_rm_ops_hv->vcpu_kick(data); + break; + default: + WARN(1, "Unexpected rm_action=%d data=%p\n", action, data); + break; + } + +} + +void kvmppc_xics_ipi_action(void) +{ + int core; + unsigned int cpu = smp_processor_id(); + struct kvmppc_host_rm_core *rm_corep; + + core = cpu >> threads_shift; + rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core]; + + if (rm_corep->rm_data) { + rm_host_ipi_action(rm_corep->rm_state.rm_action, + rm_corep->rm_data); + /* Order these stores against the real mode KVM */ + rm_corep->rm_data = NULL; + smp_wmb(); + rm_corep->rm_state.rm_action = 0; + } +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3c6badcd53ef..85b32f16fa74 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1370,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r6, VCPU_ACOP(r9) stw r7, VCPU_GUEST_PID(r9) std r8, VCPU_WORT(r9) + /* + * Restore various registers to 0, where non-zero values + * set by the guest could disrupt the host. + */ + li r0, 0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + mtspr SPRN_TCSCR, r0 + mtspr SPRN_WORT, r0 + /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ + li r0, 1 + sldi r0, r0, 31 + mtspr SPRN_MMCRS, r0 8: /* Save and reset AMR and UAMOR before turning on the MMU */ @@ -2006,8 +2020,8 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table - .long 0 /* 0x138 */ - .long 0 /* 0x13c */ + .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table .long 0 /* 0x140 */ .long 0 /* 0x144 */ .long 0 /* 0x148 */ @@ -2153,7 +2167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ 2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW - rlwimi r5, r4, 1, DAWRX_WT + rlwimi r5, r4, 2, DAWRX_WT clrrdi r4, r4, 3 std r4, VCPU_DAWR(r3) std r5, VCPU_DAWRX(r3) @@ -2404,6 +2418,8 @@ machine_check_realmode: * guest as machine check causing guest to crash. */ ld r11, VCPU_MSR(r9) + rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ + bne mc_cont /* if so, exit to host */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */ beq 1f /* Deliver a machine check to guest */ ld r10, VCPU_PC(r9) diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f2c75a1e0536..02176fd52f84 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba, + tce, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce_value = kvmppc_get_gpr(vcpu, 6); + unsigned long npages = kvmppc_get_gpr(vcpu, 7); + long rc; + + rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { long rc = kvmppc_xics_hcall(vcpu, cmd); @@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) return kvmppc_h_pr_bulk_remove(vcpu); case H_PUT_TCE: return kvmppc_h_pr_put_tce(vcpu); + case H_PUT_TCE_INDIRECT: + return kvmppc_h_pr_put_tce_indirect(vcpu); + case H_STUFF_TCE: + return kvmppc_h_pr_stuff_tce(vcpu); case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6fd2405c7f4a..19aa59b0850c 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -33,6 +33,7 @@ #include <asm/tlbflush.h> #include <asm/cputhreads.h> #include <asm/irqflags.h> +#include <asm/iommu.h> #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm) unsigned int i; struct kvm_vcpu *vcpu; +#ifdef CONFIG_KVM_XICS + /* + * We call kick_all_cpus_sync() to ensure that all + * CPUs have executed any pending IPIs before we + * continue and free VCPUs structures below. + */ + if (is_kvmppc_hv_enabled(kvm)) + kick_all_cpus_sync(); +#endif + kvm_for_each_vcpu(i, vcpu, kvm) kvm_arch_vcpu_free(vcpu); @@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: + case KVM_CAP_SPAPR_TCE_64: case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: @@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_GET_SMMU_INFO: r = 1; break; + case KVM_CAP_SPAPR_MULTITCE: + r = 1; + break; #endif default: r = 0; @@ -919,21 +934,17 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); break; case KVM_REG_PPC_VRSAVE: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vrsave = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; #endif /* CONFIG_ALTIVEC */ default: @@ -974,17 +985,21 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); break; case KVM_REG_PPC_VRSAVE: - val = get_reg_val(reg->id, vcpu->arch.vrsave); + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); break; #endif /* CONFIG_ALTIVEC */ default: @@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } #ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CREATE_SPAPR_TCE_64: { + struct kvm_create_spapr_tce_64 create_tce_64; + + r = -EFAULT; + if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64))) + goto out; + if (create_tce_64.flags) { + r = -EINVAL; + goto out; + } + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); + goto out; + } case KVM_CREATE_SPAPR_TCE: { struct kvm_create_spapr_tce create_tce; + struct kvm_create_spapr_tce_64 create_tce_64; r = -EFAULT; if (copy_from_user(&create_tce, argp, sizeof(create_tce))) goto out; - r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + + create_tce_64.liobn = create_tce.liobn; + create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K; + create_tce_64.offset = 0; + create_tce_64.size = create_tce.window_size >> + IOMMU_PAGE_SHIFT_4K; + create_tce_64.flags = 0; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64); goto out; } case KVM_PPC_GET_SMMU_INFO: { diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 0762c1e08c88..edb09912f0c9 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -111,7 +111,13 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, */ if (!(old_pte & _PAGE_COMBO)) { flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags); - old_pte &= ~_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND; + /* + * clear the old slot details from the old and new pte. + * On hash insert failure we use old pte value and we don't + * want slot information there if we have a insert failure. + */ + old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); + new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND); goto htab_insert_hpte; } /* diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 49b152b0f926..eb2accdd76fd 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -78,9 +78,19 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, * base page size. This is because demote_segment won't flush * hash page table entries. */ - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) + if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, ssize, flags); + /* + * With THP, we also clear the slot information with + * respect to all the 64K hash pte mapping the 16MB + * page. They are all invalid now. This make sure we + * don't find the slot valid when we fault with 4k + * base page size. + * + */ + memset(hpte_slot_array, 0, PTE_FRAG_SIZE); + } } valid = hpte_valid(hpte_slot_array, index); diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c index 7e6d0880813f..83a8be791e06 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -8,6 +8,8 @@ #include <linux/mm.h> #include <linux/hugetlb.h> +#include <asm/mmu.h> + #ifdef CONFIG_PPC_FSL_BOOK3E #ifdef CONFIG_PPC64 static inline int tlb1_next(void) @@ -60,6 +62,14 @@ static inline void book3e_tlb_lock(void) unsigned long tmp; int token = smp_processor_id() + 1; + /* + * Besides being unnecessary in the absence of SMT, this + * check prevents trying to do lbarx/stbcx. on e5500 which + * doesn't implement either feature. + */ + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + asm volatile("1: lbarx %0, 0, %1;" "cmpwi %0, 0;" "bne 2f;" @@ -80,6 +90,9 @@ static inline void book3e_tlb_unlock(void) { struct paca_struct *paca = get_paca(); + if (!cpu_has_feature(CPU_FTR_SMT)) + return; + isync(); paca->tcd_ptr->lock = 0; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 22d94c3e6fc4..f078a1f94fc2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -541,7 +541,7 @@ static int __init add_system_ram_resources(void) res->name = "System RAM"; res->start = base; res->end = base + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; WARN_ON(request_resource(&iomem_resource, res) < 0); } } @@ -560,12 +560,12 @@ subsys_initcall(add_system_ram_resources); */ int devmem_is_allowed(unsigned long pfn) { + if (page_is_rtas_user_buf(pfn)) + return 1; if (iomem_is_exclusive(PFN_PHYS(pfn))) return 0; if (!page_is_ram(pfn)) return 1; - if (page_is_rtas_user_buf(pfn)) - return 1; return 0; } #endif /* CONFIG_STRICT_DEVMEM */ diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 0f0502e12f6c..4087705ba90f 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -59,9 +59,9 @@ unsigned long arch_mmap_rnd(void) /* 8MB for 32bit, 1GB for 64bit */ if (is_32bit_task()) - rnd = (unsigned long)get_random_int() % (1<<(23-PAGE_SHIFT)); + rnd = get_random_long() % (1<<(23-PAGE_SHIFT)); else - rnd = (unsigned long)get_random_int() % (1<<(30-PAGE_SHIFT)); + rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT)); return rnd << PAGE_SHIFT; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 83dfd7925c72..de37ff445362 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* CONFIG_DEBUG_VM */ +unsigned long vmalloc_to_phys(void *va) +{ + unsigned long pfn = vmalloc_to_pfn(va); + + BUG_ON(!pfn); + return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va); +} +EXPORT_SYMBOL_GPL(vmalloc_to_phys); diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 3124a20d0fab..cdf2123d46db 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -646,6 +646,28 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } +void pmdp_huge_split_prepare(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); + + /* + * We can't mark the pmd none here, because that will cause a race + * against exit_mmap. We need to continue mark pmd TRANS HUGE, while + * we spilt, but at the same time we wan't rest of the ppc64 code + * not to insert hash pte on this, because we will be modifying + * the deposited pgtable in the caller of this function. Hence + * clear the _PAGE_USER so that we move the fault handling to + * higher level function and that will serialize against ptl. + * We need to flush existing hash pte entries here even though, + * the translation is still valid, because we will withdraw + * pgtable_t after this. + */ + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); +} + + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -663,10 +685,20 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +/* + * We use this to invalidate a pmdp entry before switching from a + * hugepte to regular pmd entry. + */ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + + /* + * This ensures that generic code that rely on IRQ disabling + * to prevent a parallel THP split work as expected. + */ + kick_all_cpus_sync(); } /* diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9f9dfda9ed2c..3b09ecfd0aee 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event) } } -static unsigned long vmalloc_to_phys(void *v) -{ - struct page *p = vmalloc_to_page(v); - - BUG_ON(!p); - return page_to_phys(p) + offset_in_page(v); -} - /* */ struct event_uniq { struct rb_node node; diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c index 7d5e295255b7..9958ba8bf0d2 100644 --- a/arch/powerpc/perf/power8-pmu.c +++ b/arch/powerpc/perf/power8-pmu.c @@ -816,7 +816,7 @@ static struct power_pmu power8_pmu = { .get_constraint = power8_get_constraint, .get_alternatives = power8_get_alternatives, .disable_pmc = power8_disable_pmc, - .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER | PPMU_ARCH_207S, + .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, .n_generic = ARRAY_SIZE(power8_generic_events), .generic_events = power8_generic_events, .cache_events = &power8_cache_events, diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 5f152b95ca0c..87f47e55aab6 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -444,9 +444,12 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * PCI devices of the PE are expected to be removed prior * to PE reset. */ - if (!edev->pe->bus) + if (!(edev->pe->state & EEH_PE_PRI_BUS)) { edev->pe->bus = pci_find_bus(hose->global_number, pdn->busno); + if (edev->pe->bus) + edev->pe->state |= EEH_PE_PRI_BUS; + } /* * Enable EEH explicitly so that we will do EEH check diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 573ae1994097..f90dc04395bf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3180,6 +3180,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_dma_dev_setup, + .dma_bus_setup = pnv_pci_dma_bus_setup, #ifdef CONFIG_PCI_MSI .setup_msi_irqs = pnv_setup_msi_irqs, .teardown_msi_irqs = pnv_teardown_msi_irqs, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 2f55c86df703..b1ef84a6c9d1 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -599,6 +599,9 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, u64 rpn = __pa(uaddr) >> tbl->it_page_shift; long i; + if (proto_tce & TCE_PCI_WRITE) + proto_tce |= TCE_PCI_READ; + for (i = 0; i < npages; i++) { unsigned long newtce = proto_tce | ((rpn + i) << tbl->it_page_shift); @@ -620,6 +623,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + if (newtce & TCE_PCI_WRITE) + newtce |= TCE_PCI_READ; + oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); @@ -760,6 +766,26 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev) phb->dma_dev_setup(phb, pdev); } +void pnv_pci_dma_bus_setup(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) + continue; + + if (!pe->pbus) + continue; + + if (bus->number == ((pe->rid >> 8) & 0xFF)) { + pe->pbus = bus; + break; + } + } +} + void pnv_pci_shutdown(void) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 7f56313e8d72..00691a9b99af 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -242,6 +242,7 @@ extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); +extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index eae32654bdf2..afdf62f2a695 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) icp_native_set_qirr(cpu, IPI_PRIORITY); } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +void icp_native_cause_ipi_rm(int cpu) +{ + /* + * Currently not used to send IPIs to another CPU + * on the same core. Only caller is KVM real mode. + * Need the physical address of the XICS to be + * previously saved in kvm_hstate in the paca. + */ + unsigned long xics_phys; + + /* + * Just like the cause_ipi functions, it is required to + * include a full barrier (out8 includes a sync) before + * causing the IPI. + */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); +} +#endif + /* * Called when an interrupt is received on an off-line CPU to * clear the interrupt, so that the CPU can go back to nap mode. |