diff options
71 files changed, 515 insertions, 233 deletions
diff --git a/Documentation/trace/fprobe.rst b/Documentation/trace/fprobe.rst index 7a895514b537..196f52386aaa 100644 --- a/Documentation/trace/fprobe.rst +++ b/Documentation/trace/fprobe.rst @@ -91,9 +91,9 @@ The prototype of the entry/exit callback function are as follows: .. code-block:: c - int entry_callback(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs, void *entry_data); + int entry_callback(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct pt_regs *regs, void *entry_data); - void exit_callback(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs, void *entry_data); + void exit_callback(struct fprobe *fp, unsigned long entry_ip, unsigned long ret_ip, struct pt_regs *regs, void *entry_data); Note that the @entry_ip is saved at function entry and passed to exit handler. If the entry callback function returns !0, the corresponding exit callback will be cancelled. @@ -108,6 +108,10 @@ If the entry callback function returns !0, the corresponding exit callback will Note that this may not be the actual entry address of the function but the address where the ftrace is instrumented. +@ret_ip + This is the return address that the traced function will return to, + somewhere in the caller. This can be used at both entry and exit. + @regs This is the `pt_regs` data structure at the entry and exit. 
Note that the instruction pointer of @regs may be different from the @entry_ip diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 5882b2415596..1095c6647e96 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -344,14 +344,14 @@ */ #define __HFGRTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51)) #define __HFGRTR_EL2_MASK GENMASK(49, 0) -#define __HFGRTR_EL2_nMASK (GENMASK(55, 54) | BIT(50)) +#define __HFGRTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) #define __HFGWTR_EL2_RES0 (GENMASK(63, 56) | GENMASK(53, 51) | \ BIT(46) | BIT(42) | BIT(40) | BIT(28) | \ GENMASK(26, 25) | BIT(21) | BIT(18) | \ GENMASK(15, 14) | GENMASK(10, 9) | BIT(2)) #define __HFGWTR_EL2_MASK GENMASK(49, 0) -#define __HFGWTR_EL2_nMASK (GENMASK(55, 54) | BIT(50)) +#define __HFGWTR_EL2_nMASK (GENMASK(58, 57) | GENMASK(55, 54) | BIT(50)) #define __HFGITR_EL2_RES0 GENMASK(63, 57) #define __HFGITR_EL2_MASK GENMASK(54, 0) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 6dcdae4d38cb..a1e24228aaaa 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -55,11 +55,6 @@ static struct irq_ops arch_timer_irq_ops = { .get_input_level = kvm_arch_timer_get_input_level, }; -static bool has_cntpoff(void) -{ - return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); -} - static int nr_timers(struct kvm_vcpu *vcpu) { if (!vcpu_has_nv(vcpu)) @@ -180,7 +175,7 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } -static void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) +void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map) { if (vcpu_has_nv(vcpu)) { if (is_hyp_ctxt(vcpu)) { @@ -548,8 +543,7 @@ static void timer_save_state(struct arch_timer_context *ctx) timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL)); cval = read_sysreg_el0(SYS_CNTP_CVAL); - if (!has_cntpoff()) - cval -= timer_get_offset(ctx); + cval -= 
timer_get_offset(ctx); timer_set_cval(ctx, cval); @@ -636,8 +630,7 @@ static void timer_restore_state(struct arch_timer_context *ctx) cval = timer_get_cval(ctx); offset = timer_get_offset(ctx); set_cntpoff(offset); - if (!has_cntpoff()) - cval += offset; + cval += offset; write_sysreg_el0(cval, SYS_CNTP_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 9ced1bf0c2b7..ee902ff2a50f 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -977,6 +977,8 @@ enum fg_filter_id { static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = { /* HFGRTR_EL2, HFGWTR_EL2 */ + SR_FGT(SYS_PIR_EL1, HFGxTR, nPIR_EL1, 0), + SR_FGT(SYS_PIRE0_EL1, HFGxTR, nPIRE0_EL1, 0), SR_FGT(SYS_TPIDR2_EL0, HFGxTR, nTPIDR2_EL0, 0), SR_FGT(SYS_SMPRI_EL1, HFGxTR, nSMPRI_EL1, 0), SR_FGT(SYS_ACCDATA_EL1, HFGxTR, nACCDATA_EL1, 0), diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 6537f58b1a8c..448b17080d36 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -39,6 +39,26 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); + if (has_cntpoff()) { + struct timer_map map; + + get_timer_map(vcpu, &map); + + /* + * We're entering the guest. Reload the correct + * values from memory now that TGE is clear. 
+ */ + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2); + + if (map.direct_ptimer) { + write_sysreg_el0(val, SYS_CNTP_CVAL); + isb(); + } + } + val = read_sysreg(cpacr_el1); val |= CPACR_ELx_TTA; val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN | @@ -77,6 +97,30 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2); + if (has_cntpoff()) { + struct timer_map map; + u64 val, offset; + + get_timer_map(vcpu, &map); + + /* + * We're exiting the guest. Save the latest CVAL value + * to memory and apply the offset now that TGE is set. + */ + val = read_sysreg_el0(SYS_CNTP_CVAL); + if (map.direct_ptimer == vcpu_ptimer(vcpu)) + __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; + if (map.direct_ptimer == vcpu_hptimer(vcpu)) + __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; + + offset = read_sysreg_s(SYS_CNTPOFF_EL2); + + if (map.direct_ptimer && offset) { + write_sysreg_el0(val + offset, SYS_CNTP_CVAL); + isb(); + } + } + /* * ARM errata 1165522 and 1530923 require the actual execution of the * above before we can switch to the EL2/EL0 translation regime used by diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 0eea225fd09a..a243934c5568 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -39,7 +39,7 @@ void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu || !kvm_pmu_switch_needed(attr)) + if (!kvm_arm_support_pmu_v3() || !kvm_pmu_switch_needed(attr)) return; if (!attr->exclude_host) @@ -55,7 +55,7 @@ void kvm_clr_pmu_events(u32 clr) { struct kvm_pmu_events *pmu = kvm_get_pmu_events(); - if (!kvm_arm_support_pmu_v3() || !pmu) + if (!kvm_arm_support_pmu_v3()) return; pmu->events_host &= ~clr; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 
e92ec810d449..0afd6136e275 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2122,8 +2122,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi }, { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, - { SYS_DESC(SYS_PIRE0_EL1), access_vm_reg, reset_unknown, PIRE0_EL1 }, - { SYS_DESC(SYS_PIR_EL1), access_vm_reg, reset_unknown, PIR_EL1 }, + { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1 }, + { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1 }, { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, { SYS_DESC(SYS_LORSA_EL1), trap_loregion }, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 7b2ac1319d70..467ee6b95ae1 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -592,7 +592,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, gfn_t gfn = gpa >> PAGE_SHIFT; int srcu_idx, err; kvm_pfn_t pfn; - pte_t *ptep, entry, old_pte; + pte_t *ptep, entry; bool writeable; unsigned long prot_bits; unsigned long mmu_seq; @@ -664,7 +664,6 @@ retry: entry = pfn_pte(pfn, __pgprot(prot_bits)); /* Write the PTE */ - old_pte = *ptep; set_pte(ptep, entry); err = 0; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c1b47d608a2b..efaebba5ee19 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -303,11 +303,6 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) return 0; } -static inline int gisa_in_alert_list(struct kvm_s390_gisa *gisa) -{ - return READ_ONCE(gisa->next_alert) != (u32)virt_to_phys(gisa); -} - static inline void gisa_set_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) { set_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); @@ -3216,11 +3211,12 @@ void kvm_s390_gisa_destroy(struct kvm *kvm) if (!gi->origin) return; - if (gi->alert.mask) - KVM_EVENT(3, "vm 0x%pK has unexpected iam 0x%02x", - kvm, gi->alert.mask); - while 
(gisa_in_alert_list(gi->origin)) - cpu_relax(); + WARN(gi->alert.mask != 0x00, + "unexpected non zero alert.mask 0x%02x", + gi->alert.mask); + gi->alert.mask = 0x00; + if (gisa_set_iam(gi->origin, gi->alert.mask)) + process_gib_alert_list(); hrtimer_cancel(&gi->timer); gi->origin = NULL; VM_EVENT(kvm, 3, "gisa 0x%pK destroyed", gisa); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 31089b851c4f..a2be3aefff9f 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -157,7 +157,8 @@ static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif -extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u32 pkru); +extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, + unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 17715cb8731d..70d139406bc8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -528,7 +528,6 @@ struct kvm_pmu { u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; - struct irq_work irq_work; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 19bf955b67e0..3ac0ffc4f3e2 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -268,6 +268,7 @@ enum avic_ipi_failure_cause { AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, AVIC_IPI_FAILURE_INVALID_TARGET, AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, + AVIC_IPI_FAILURE_INVALID_IPI_VECTOR, }; #define AVIC_PHYSICAL_MAX_INDEX_MASK GENMASK_ULL(8, 0) diff 
--git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index a86d37052a64..a21a4d0ecc34 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -369,14 +369,15 @@ int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) EXPORT_SYMBOL_GPL(fpu_swap_kvm_fpstate); void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, - unsigned int size, u32 pkru) + unsigned int size, u64 xfeatures, u32 pkru) { struct fpstate *kstate = gfpu->fpstate; union fpregs_state *ustate = buf; struct membuf mb = { .p = buf, .left = size }; if (cpu_feature_enabled(X86_FEATURE_XSAVE)) { - __copy_xstate_to_uabi_buf(mb, kstate, pkru, XSTATE_COPY_XSAVE); + __copy_xstate_to_uabi_buf(mb, kstate, xfeatures, pkru, + XSTATE_COPY_XSAVE); } else { memcpy(&ustate->fxsave, &kstate->regs.fxsave, sizeof(ustate->fxsave)); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index cadf68737e6b..ef6906107c54 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1049,6 +1049,7 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy + * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * @@ -1059,7 +1060,8 @@ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, * It supports partial copy but @to.pos always starts from zero. 
*/ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode) + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; @@ -1083,7 +1085,7 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, break; case XSTATE_COPY_XSAVE: - header.xfeatures &= fpstate->user_xfeatures; + header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } @@ -1185,6 +1187,7 @@ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, + tsk->thread.fpu.fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1536,10 +1539,7 @@ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; - - if (!guest_fpu) - newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; - + newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index a4ecb04d8d64..3518fb26d06b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -43,7 +43,8 @@ enum xstate_copy_mode { struct membuf; extern void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, - u32 pkru_val, enum xstate_copy_mode copy_mode); + u64 xfeatures, u32 pkru_val, + enum xstate_copy_mode copy_mode); extern void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode mode); extern int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0544e30b4946..773132c3bf5a 100644 --- a/arch/x86/kvm/cpuid.c +++ 
b/arch/x86/kvm/cpuid.c @@ -360,14 +360,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent); - /* - * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if - * XSAVE/XCRO are not exposed to the guest, and even if XSAVE isn't - * supported by the host. - */ - vcpu->arch.guest_fpu.fpstate->user_xfeatures = vcpu->arch.guest_supported_xcr0 | - XFEATURE_MASK_FPSSE; - kvm_update_pv_runtime(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index dcd60b39e794..3e977dbbf993 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2759,13 +2759,17 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = kvm_lapic_get_reg(apic, lvt_type); int vector, mode, trig_mode; + int r; if (kvm_apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { vector = reg & APIC_VECTOR_MASK; mode = reg & APIC_MODE_MASK; trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; - return __apic_accept_irq(apic, mode, vector, 1, trig_mode, - NULL); + + r = __apic_accept_irq(apic, mode, vector, 1, trig_mode, NULL); + if (r && lvt_type == APIC_LVTPC) + kvm_lapic_set_reg(apic, APIC_LVTPC, reg | APIC_LVT_MASKED); + return r; } return 0; } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index edb89b51b383..9ae07db6f0f6 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,14 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static void kvm_pmi_trigger_fn(struct irq_work *irq_work) -{ - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); - struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - - kvm_pmu_deliver_pmi(vcpu); -} - static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -124,20 +116,7 @@ static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool 
in_pmi) __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); } - if (!pmc->intr || skip_pmi) - return; - - /* - * Inject PMI. If vcpu was in a guest mode during NMI PMI - * can be ejected on a guest mode re-entry. Otherwise we can't - * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until - * woken up. So we should wake it, but this is impossible from - * NMI context. Do it from irq work instead. - */ - if (in_pmi && !kvm_handling_nmi_from_guest(pmc->vcpu)) - irq_work_queue(&pmc_to_pmu(pmc)->irq_work); - else + if (pmc->intr && !skip_pmi) kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } @@ -675,9 +654,6 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - - irq_work_sync(&pmu->irq_work); static_call(kvm_x86_pmu_reset)(vcpu); } @@ -687,7 +663,6 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); static_call(kvm_x86_pmu_init)(vcpu); - init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7d9ba301c090..1d64113de488 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -74,6 +74,12 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } +static inline void pmc_write_counter(struct kvm_pmc *pmc, u64 val) +{ + pmc->counter += val - pmc_read_counter(pmc); + pmc->counter &= pmc_bitmask(pmc); +} + static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 2092db892d7d..4b74ea91f4e6 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -529,8 +529,11 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); break; + case 
AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: + /* Invalid IPI with vector < 16 */ + break; default: - pr_err("Unknown IPI interception\n"); + vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); } return 1; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index dd496c9e5f91..3fea8c47679e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1253,6 +1253,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) nested_svm_uninit_mmu_context(vcpu); vmcb_mark_all_dirty(svm->vmcb); + + if (kvm_apicv_activated(vcpu->kvm)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); } kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index cef5a3d0abd0..373ff6a6687b 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -160,7 +160,7 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* MSR_PERFCTRn */ pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); if (pmc) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); return 0; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9507df93f410..beea99c8e8e0 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -691,7 +691,7 @@ static int svm_hardware_enable(void) */ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) { struct sev_es_save_area *hostsa; - u32 msr_hi; + u32 __maybe_unused msr_hi; hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); @@ -913,8 +913,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept) if (intercept == svm->x2avic_msrs_intercepted) return; - if (!x2avic_enabled || - !apic_x2apic_mode(svm->vcpu.arch.apic)) + if (!x2avic_enabled) return; for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) { diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index f2efa0bf7ae8..820d3e1f6b4f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ 
b/arch/x86/kvm/vmx/pmu_intel.c @@ -436,11 +436,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_fixed_pmc(pmu, msr))) { - pmc->counter += data - pmc_read_counter(pmc); + pmc_write_counter(pmc, data); pmc_update_sample_period(pmc); break; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f18b06bbda6..41cce5031126 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5382,26 +5382,37 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, - struct kvm_xsave *guest_xsave) -{ - if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) - return; - - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - guest_xsave->region, - sizeof(guest_xsave->region), - vcpu->arch.pkru); -} static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, u8 *state, unsigned int size) { + /* + * Only copy state for features that are enabled for the guest. The + * state itself isn't problematic, but setting bits in the header for + * features that are supported in *this* host but not exposed to the + * guest can result in KVM_SET_XSAVE failing when live migrating to a + * compatible host without the features that are NOT exposed to the + * guest. + * + * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if + * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't + * supported by the host. 
+ */ + u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 | + XFEATURE_MASK_FPSSE; + if (fpstate_is_confidential(&vcpu->arch.guest_fpu)) return; - fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, - state, size, vcpu->arch.pkru); + fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size, + supported_xcr0, vcpu->arch.pkru); +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + return kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region, + sizeof(guest_xsave->region)); } static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, @@ -12843,6 +12854,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; #endif + if (kvm_test_request(KVM_REQ_PMI, vcpu)) + return true; + if (kvm_arch_interrupt_allowed(vcpu) && (kvm_cpu_has_interrupt(vcpu) || kvm_guest_apic_has_interrupt(vcpu))) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 884cb51c8f67..234a84ecde8b 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1478,7 +1478,7 @@ static int dev_get_regmap_match(struct device *dev, void *res, void *data) /* If the user didn't specify a name match any */ if (data) - return !strcmp((*r)->name, data); + return (*r)->name && !strcmp((*r)->name, data); else return 1; } diff --git a/drivers/spi/spi-npcm-fiu.c b/drivers/spi/spi-npcm-fiu.c index 0ca21ff0e9cc..e42248519688 100644 --- a/drivers/spi/spi-npcm-fiu.c +++ b/drivers/spi/spi-npcm-fiu.c @@ -353,8 +353,9 @@ static int npcm_fiu_uma_read(struct spi_mem *mem, uma_cfg |= ilog2(op->cmd.buswidth); uma_cfg |= ilog2(op->addr.buswidth) << NPCM_FIU_UMA_CFG_ADBPCK_SHIFT; - uma_cfg |= ilog2(op->dummy.buswidth) - << NPCM_FIU_UMA_CFG_DBPCK_SHIFT; + if (op->dummy.nbytes) + uma_cfg |= ilog2(op->dummy.buswidth) + << NPCM_FIU_UMA_CFG_DBPCK_SHIFT; uma_cfg |= ilog2(op->data.buswidth) << NPCM_FIU_UMA_CFG_RDBPCK_SHIFT; uma_cfg |= op->dummy.nbytes << NPCM_FIU_UMA_CFG_DBSIZ_SHIFT; diff 
--git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index 5c87817a4f4c..3dcf83f5e7b4 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -3440,11 +3440,15 @@ static int atyfb_setup_generic(struct pci_dev *pdev, struct fb_info *info, } info->fix.mmio_start = raddr; +#if defined(__i386__) || defined(__ia64__) /* * By using strong UC we force the MTRR to never have an * effect on the MMIO region on both non-PAT and PAT systems. */ par->ati_regbase = ioremap_uc(info->fix.mmio_start, 0x1000); +#else + par->ati_regbase = ioremap(info->fix.mmio_start, 0x1000); +#endif if (par->ati_regbase == NULL) return -ENOMEM; diff --git a/drivers/video/fbdev/core/cfbcopyarea.c b/drivers/video/fbdev/core/cfbcopyarea.c index 6d4bfeecee35..5b80bf3dae50 100644 --- a/drivers/video/fbdev/core/cfbcopyarea.c +++ b/drivers/video/fbdev/core/cfbcopyarea.c @@ -382,7 +382,7 @@ void cfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) { u32 dx = area->dx, dy = area->dy, sx = area->sx, sy = area->sy; u32 height = area->height, width = area->width; - unsigned long const bits_per_line = p->fix.line_length*8u; + unsigned int const bits_per_line = p->fix.line_length * 8u; unsigned long __iomem *base = NULL; int bits = BITS_PER_LONG, bytes = bits >> 3; unsigned dst_idx = 0, src_idx = 0, rev_copy = 0; diff --git a/drivers/video/fbdev/core/syscopyarea.c b/drivers/video/fbdev/core/syscopyarea.c index c1eda3190968..7b8bd3a2bedc 100644 --- a/drivers/video/fbdev/core/syscopyarea.c +++ b/drivers/video/fbdev/core/syscopyarea.c @@ -316,7 +316,7 @@ void sys_copyarea(struct fb_info *p, const struct fb_copyarea *area) { u32 dx = area->dx, dy = area->dy, sx = area->sx, sy = area->sy; u32 height = area->height, width = area->width; - unsigned long const bits_per_line = p->fix.line_length*8u; + unsigned int const bits_per_line = p->fix.line_length * 8u; unsigned long *base = NULL; int bits = BITS_PER_LONG, bytes = bits >> 3; unsigned 
dst_idx = 0, src_idx = 0, rev_copy = 0; diff --git a/drivers/video/fbdev/mmp/hw/mmp_ctrl.h b/drivers/video/fbdev/mmp/hw/mmp_ctrl.h index 167585a889d3..719b99a9bc77 100644 --- a/drivers/video/fbdev/mmp/hw/mmp_ctrl.h +++ b/drivers/video/fbdev/mmp/hw/mmp_ctrl.h @@ -1406,7 +1406,7 @@ struct mmphw_ctrl { /*pathes*/ int path_num; - struct mmphw_path_plat path_plats[]; + struct mmphw_path_plat path_plats[] __counted_by(path_num); }; static inline int overlay_is_vid(struct mmp_overlay *overlay) diff --git a/drivers/video/fbdev/omap/omapfb_main.c b/drivers/video/fbdev/omap/omapfb_main.c index f28cb90947a3..42c96f1cfc93 100644 --- a/drivers/video/fbdev/omap/omapfb_main.c +++ b/drivers/video/fbdev/omap/omapfb_main.c @@ -1645,13 +1645,13 @@ static int omapfb_do_probe(struct platform_device *pdev, } fbdev->int_irq = platform_get_irq(pdev, 0); if (fbdev->int_irq < 0) { - r = ENXIO; + r = -ENXIO; goto cleanup; } fbdev->ext_irq = platform_get_irq(pdev, 1); if (fbdev->ext_irq < 0) { - r = ENXIO; + r = -ENXIO; goto cleanup; } diff --git a/drivers/video/fbdev/sa1100fb.c b/drivers/video/fbdev/sa1100fb.c index 3d76ce111488..cf0f706762b4 100644 --- a/drivers/video/fbdev/sa1100fb.c +++ b/drivers/video/fbdev/sa1100fb.c @@ -1214,7 +1214,7 @@ static struct platform_driver sa1100fb_driver = { }, }; -int __init sa1100fb_init(void) +static int __init sa1100fb_init(void) { if (fb_get_options("sa1100fb", NULL)) return -ENODEV; diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c index a1a67830fbbc..e1f421e91b4f 100644 --- a/drivers/video/fbdev/uvesafb.c +++ b/drivers/video/fbdev/uvesafb.c @@ -1928,10 +1928,10 @@ static void uvesafb_exit(void) } } - cn_del_callback(&uvesafb_cn_id); driver_remove_file(&uvesafb_driver.driver, &driver_attr_v86d); platform_device_unregister(uvesafb_device); platform_driver_unregister(&uvesafb_driver); + cn_del_callback(&uvesafb_cn_id); } module_exit(uvesafb_exit); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 
5a5a8d488a7b..b9ef6f54635c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5109,7 +5109,7 @@ static void init_alloc_chunk_ctl_policy_regular( ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); - ctl->max_stripe_size = ctl->max_chunk_size; + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); diff --git a/fs/namei.c b/fs/namei.c index 567ee547492b..94565bd7e73f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -249,7 +249,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + atomic_set(&result->refcnt, 1); audit_getname(result); return result; @@ -261,9 +261,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) + return; - if (--name->refcnt > 0) + if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index a9d82bbb4729..63f70259edc0 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1106,10 +1106,10 @@ repack: } } - /* + /* * The code below may require additional cluster (to extend attribute list) - * and / or one MFT record - * It is too complex to undo operations if -ENOSPC occurs deep inside + * and / or one MFT record + * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. 
*/ @@ -1736,10 +1736,8 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) { - err = -ENOENT; - goto out; - } + if (!attr_b) + return -ENOENT; attr = attr_b; le = le_b; diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 42631b31adf1..7c01735d1219 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is resident: lsize < record_size (1K or 4K) */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; @@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (err < 0) goto out; - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is nonresident. + * The worst case: + * 1T (2^40) extremely fragmented file. + * cluster = 4K (2^12) => 2^28 fragments + * 2^9 fragments per one record => 2^19 records + * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. + * + * the result is 16M bytes per attribute list. 
+ * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] + */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 107e808e06ea..63f14a0232f6 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd) struct rb_node *node, *next; kfree(wnd->free_bits); + wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); @@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) wnd->bits_last = wbits; wnd->free_bits = - kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); + if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 063a6654199b..ec0566b322d5 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ + if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) + dt_type = DT_LNK; + else + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? 
DT_DIR : DT_REG; return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 962f12ce6c0a..1f7a194983c5 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -745,8 +745,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags) + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 2b85cb10f0be..dad976a68985 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2148,7 +2148,7 @@ out1: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; - if (i == idx) + if (i == idx || !pg) continue; unlock_page(pg); put_page(pg); @@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) continue; + /* Check simple case when parent inode equals current inode. */ + if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + continue; + } + /* ntfs_iget5 may sleep. 
*/ dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 12f28cdf5c83..98ccb6650858 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -2168,8 +2168,10 @@ file_is_valid: if (!page) { page = kmalloc(log->page_size, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto out; + } } /* diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 33afee0f5559..fbfe21dbb425 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -983,18 +983,11 @@ out: if (err) return err; - mark_inode_dirty(&ni->vfs_inode); + mark_inode_dirty_sync(&ni->vfs_inode); /* verify(!ntfs_update_mftmirr()); */ - /* - * If we used wait=1, sync_inode_metadata waits for the io for the - * inode to finish. It hangs when media is removed. - * So wait=0 is sent down to sync_inode_metadata - * and filemap_fdatawrite is used for the data blocks. - */ - err = sync_inode_metadata(&ni->vfs_inode, 0); - if (!err) - err = filemap_fdatawrite(ni->vfs_inode.i_mapping); + /* write mft record on disk. 
*/ + err = _ni_write_inode(&ni->vfs_inode, 1); return err; } @@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; + bool dirty = false; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (!wnd_is_used(wnd, lcn, len)) { - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* mark volume as dirty out of wnd->rw_lock */ + dirty = true; end = lcn + len; len = 0; @@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) out: up_write(&wnd->rw_lock); + if (dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 124c6e822623..cf92b2433f7a 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 total = le32_to_cpu(hdr->total); u16 offs[128]; + if (unlikely(!cmp)) + return NULL; + fill_table: if (end > total) return NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index eb2ed0701495..d6d021e19aaa 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -170,8 +170,8 @@ next_attr: nt2kernel(std5->cr_time, &ni->i_crtime); #endif nt2kernel(std5->a_time, &inode->i_atime); - ctime = inode_get_ctime(inode); nt2kernel(std5->c_time, &ctime); + inode_set_ctime_to_ts(inode, ctime); nt2kernel(std5->m_time, &inode->i_mtime); ni->std_fa = std5->fa; @@ -1660,7 +1660,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. 
*/ - inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime); + inode->i_atime = inode->i_mtime = + inode_set_ctime_to_ts(inode, ni->i_crtime); dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ad430d50bd79..eedacf94edd8 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_mtime = inode_set_ctime_to_ts(inode, - inode_set_ctime_current(dir)); + dir->i_mtime = inode_set_ctime_to_ts( + inode, inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -373,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (IS_POSIXACL(dir)) { - /* + /* * Load in cache current acl to avoid ni_lock(dir): * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create -> * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 98b76d1b09e7..86aecbb01a92 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -847,7 +847,7 @@ struct OBJECT_ID { // Birth Volume Id is the Object Id of the Volume on. // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; //0x10: - + // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. 
If the Object Id // is changed for some reason, this field will reflect the diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 629403ede6e5..0e6a2777870c 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -42,9 +42,11 @@ enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 +#define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 +#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. */ @@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index c12ebffc94da..53629b1f65e9 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -189,12 +189,19 @@ out: return err; } +/* + * mi_enum_attr - start/continue attributes enumeration in record. 
+ * + * NOTE: mi->mrec - memory of size sbi->record_size + * here we sure that mi->mrec->total == sbi->record_size (see mi_read) + */ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); - u32 t32, off, asize; + u32 t32, off, asize, prev_type; u16 t16; + u64 data_size, alloc_size, tot_size; if (!attr) { u32 total = le32_to_cpu(rec->total); @@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (!is_rec_inuse(rec)) return NULL; + prev_type = 0; attr = Add2Ptr(rec, off); } else { /* Check if input attr inside record. */ @@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; } - if (off + asize < off) { - /* Overflow check. */ + /* Overflow check. */ + if (off + asize < off) return NULL; - } + prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } @@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); - if ((t32 & 0xf) || (t32 > 0x100)) + if (!t32 || (t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* attributes in record must be ordered by type */ + if (t32 < prev_type) return NULL; /* Check overflow and boundary. */ @@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* Check size of attribute. */ if (!attr->non_res) { + /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) return NULL; t16 = le16_to_cpu(attr->res.data_off); - if (t16 > asize) return NULL; - t32 = le32_to_cpu(attr->res.data_size); - if (t16 + t32 > asize) + if (t16 + le32_to_cpu(attr->res.data_size) > asize) return NULL; t32 = sizeof(short) * attr->name_len; @@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return attr; } - /* Check some nonresident fields. 
*/ - if (attr->name_len && - le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > - le16_to_cpu(attr->nres.run_off)) { + /* Check nonresident fields. */ + if (attr->non_res != 1) + return NULL; + + t16 = le16_to_cpu(attr->nres.run_off); + if (t16 > asize) + return NULL; + + t32 = sizeof(short) * attr->name_len; + if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) + return NULL; + + /* Check start/end vcn. */ + if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) return NULL; - } - if (attr->nres.svcn || !is_attr_ext(attr)) { + data_size = le64_to_cpu(attr->nres.data_size); + if (le64_to_cpu(attr->nres.valid_size) > data_size) + return NULL; + + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (data_size > alloc_size) + return NULL; + + t32 = mi->sbi->cluster_mask; + if (alloc_size & t32) + return NULL; + + if (!attr->nres.svcn && is_attr_ext(attr)) { + /* First segment of sparse/compressed attribute */ + if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + tot_size = le64_to_cpu(attr->nres.total_size); + if (tot_size & t32) + return NULL; + + if (tot_size > alloc_size) + return NULL; + } else { if (asize + 8 < SIZEOF_NONRESIDENT) return NULL; if (attr->nres.c_unit) return NULL; - } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) - return NULL; + } return attr; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 5661a363005e..f763e3256ccc 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root; * ntfs3.1 * cluster size * number of clusters + * total number of mft records + * number of used mft records ~= number of files + folders + * real state of ntfs "dirty"/"clean" + * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; - seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver, - sbi->volume.minor_ver, sbi->cluster_size, - 
sbi->used.bitmap.nbits); + seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n", + sbi->volume.major_ver, sbi->volume.minor_ver, + sbi->cluster_size, sbi->used.bitmap.nbits, + sbi->mft.bitmap.nbits, + sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), + sbi->volume.real_dirty ? "dirty" : "clean", + (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } @@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, { int err; struct super_block *sb = pde_data(file_inode(file)); - struct ntfs_sb_info *sbi = sb->s_fs_info; ssize_t ret = count; - u8 *label = kmalloc(count, GFP_NOFS); + u8 *label; + + if (sb_rdonly(sb)) + return -EROFS; + + label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; @@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, while (ret > 0 && label[ret - 1] == '\n') ret -= 1; - err = ntfs_set_label(sbi, label, ret); + err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); @@ -576,20 +588,30 @@ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); - if (sbi->mft.ni) + if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); + sbi->mft.ni = NULL; + } - if (sbi->security.ni) + if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); + sbi->security.ni = NULL; + } - if (sbi->reparse.ni) + if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); + sbi->reparse.ni = NULL; + } - if (sbi->objid.ni) + if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); + sbi->objid.ni = NULL; + } - if (sbi->volume.ni) + if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); + sbi->volume.ni = NULL; + } ntfs_update_mftmirr(sbi, 0); @@ -836,7 +858,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 
sectors, clusters, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -845,6 +867,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u32 boot_off = 0; const char *hint = "Primary boot"; + /* Save original dev_size. Used with alternative boot. */ + dev_size0 = dev_size; + sbi->volume.blocks = dev_size >> PAGE_SHIFT; bh = ntfs_bread(sb, 0); @@ -853,6 +878,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, check_boot: err = -EINVAL; + + /* Corrupted image; do not read OOB */ + if (bh->b_size - sizeof(*boot) < boot_off) + goto out; + boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { @@ -899,9 +929,17 @@ check_boot: goto out; } - sbi->record_size = record_size = - boot->record_size < 0 ? 1 << (-boot->record_size) : - (u32)boot->record_size << cluster_bits; + if (boot->record_size >= 0) { + record_size = (u32)boot->record_size << cluster_bits; + } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { + record_size = 1u << (-boot->record_size); + } else { + ntfs_err(sb, "%s: invalid record size %d.", hint, + boot->record_size); + goto out; + } + + sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes @@ -918,9 +956,15 @@ check_boot: goto out; } - sbi->index_size = boot->index_size < 0 ? - 1u << (-boot->index_size) : - (u32)boot->index_size << cluster_bits; + if (boot->index_size >= 0) { + sbi->index_size = (u32)boot->index_size << cluster_bits; + } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { + sbi->index_size = 1u << (-boot->index_size); + } else { + ntfs_err(sb, "%s: invalid index size %d.", hint, + boot->index_size); + goto out; + } /* Check index record size. 
*/ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { @@ -1055,17 +1099,17 @@ check_boot: if (bh->b_blocknr && !sb_rdonly(sb)) { /* - * Alternative boot is ok but primary is not ok. - * Do not update primary boot here 'cause it may be faked boot. - * Let ntfs to be mounted and update boot later. - */ + * Alternative boot is ok but primary is not ok. + * Do not update primary boot here 'cause it may be faked boot. + * Let ntfs to be mounted and update boot later. + */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) { + if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); - u64 lbo = dev_size - sizeof(*boot); + u64 lbo = dev_size0 - sizeof(*boot); /* * Try alternative boot (last sector) @@ -1079,6 +1123,7 @@ out: boot_off = lbo & (block_size - 1); hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ goto check_boot; } brelse(bh); @@ -1367,7 +1412,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } bytes = inode->i_size; - sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); + sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; @@ -1521,9 +1566,9 @@ load_root: if (boot2) { /* - * Alternative boot is ok but primary is not ok. - * Volume is recognized as NTFS. Update primary boot. - */ + * Alternative boot is ok but primary is not ok. + * Volume is recognized as NTFS. Update primary boot. 
+ */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) @@ -1564,6 +1609,7 @@ put_inode_out: out: ntfs3_put_sbi(sbi); kfree(boot2); + ntfs3_put_sbi(sbi); return err; } @@ -1757,7 +1803,6 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); - #ifdef CONFIG_PROC_FS /* Create "/proc/fs/ntfs3" */ proc_info_root = proc_mkdir("fs/ntfs3", NULL); @@ -1799,7 +1844,6 @@ static void __exit exit_ntfs_fs(void) if (proc_info_root) remove_proc_entry("fs/ntfs3", NULL); #endif - } MODULE_LICENSE("GPL"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 29fd391899e5..4920548192a0 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size = le32_to_cpu(info->size); /* Enumerate all xattrs. */ - for (ret = 0, off = 0; off < size; off += ea_size) { + ret = 0; + for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) { ea = Add2Ptr(ea_all, off); ea_size = unpacked_ea_size(ea); @@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, break; if (buffer) { + /* Check if we can use field ea->name */ + if (off + ea_size > size) + break; + if (ret + ea->name_len + 1 > bytes_per_buffer) { err = -ERANGE; goto out; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index bb3cb005873e..e748bc957d83 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -82,6 +82,8 @@ struct timer_map { struct arch_timer_context *emul_ptimer; }; +void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map); + struct arch_timer_cpu { struct arch_timer_context timers[NR_KVM_TIMERS]; @@ -145,4 +147,9 @@ u64 timer_get_cval(struct arch_timer_context *ctxt); void kvm_timer_cpu_up(void); void kvm_timer_cpu_down(void); +static inline bool has_cntpoff(void) +{ + return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); +} 
+ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b528f063e8ff..4a40823c3c67 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2403,7 +2403,7 @@ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ - int refcnt; + atomic_t refcnt; struct audit_names *aname; const char iname[]; }; diff --git a/include/video/mmp_disp.h b/include/video/mmp_disp.h index 77252cb46361..a722dcbf5073 100644 --- a/include/video/mmp_disp.h +++ b/include/video/mmp_disp.h @@ -231,7 +231,7 @@ struct mmp_path { /* layers */ int overlay_num; - struct mmp_overlay overlays[]; + struct mmp_overlay overlays[] __counted_by(overlay_num); }; extern struct mmp_path *mmp_get_path(const char *name); diff --git a/include/video/uvesafb.h b/include/video/uvesafb.h index 8d2a3bfc8dac..47d96e75e8ef 100644 --- a/include/video/uvesafb.h +++ b/include/video/uvesafb.h @@ -109,8 +109,6 @@ struct uvesafb_ktask { u32 ack; }; -static int uvesafb_exec(struct uvesafb_ktask *tsk); - #define UVESAFB_EXACT_RES 1 #define UVESAFB_EXACT_DEPTH 2 diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 21d2fa815e78..6f0d6fb6523f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2212,7 +2212,7 @@ __audit_reusename(const __user char *uptr) if (!n->name) continue; if (n->name->uptr == uptr) { - n->name->refcnt++; + atomic_inc(&n->name->refcnt); return n->name; } } @@ -2241,7 +2241,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - name->refcnt++; + atomic_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2373,7 +2373,7 @@ out_alloc: return; if (name) { n->name = name; - name->refcnt++; + atomic_inc(&name->refcnt); } out: @@ -2500,7 +2500,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - 
found_child->name->refcnt++; + atomic_inc(&found_child->name->refcnt); } } diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 3b21f4063258..881f90f0cbcf 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -189,7 +189,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) { int i, size; - if (num < 0) + if (num <= 0) return -EINVAL; if (!fp->exit_handler) { @@ -202,8 +202,8 @@ static int fprobe_init_rethook(struct fprobe *fp, int num) size = fp->nr_maxactive; else size = num * num_possible_cpus() * 2; - if (size < 0) - return -E2BIG; + if (size <= 0) + return -EINVAL; fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler); if (!fp->rethook) diff --git a/mm/slab_common.c b/mm/slab_common.c index 8fda308e400d..9bbffe82d65a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -895,10 +895,13 @@ void __init setup_kmalloc_cache_index_table(void) static unsigned int __kmalloc_minalign(void) { + unsigned int minalign = dma_get_cache_alignment(); + if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_allocated()) - return ARCH_KMALLOC_MINALIGN; - return dma_get_cache_alignment(); + minalign = ARCH_KMALLOC_MINALIGN; + + return max(minalign, arch_slab_minalign()); } void __init diff --git a/tools/arch/x86/include/uapi/asm/unistd_32.h b/tools/arch/x86/include/uapi/asm/unistd_32.h index 4798f9d18fe8..9de35df1afc3 100644 --- a/tools/arch/x86/include/uapi/asm/unistd_32.h +++ b/tools/arch/x86/include/uapi/asm/unistd_32.h @@ -26,6 +26,6 @@ #ifndef __NR_setns #define __NR_setns 346 #endif -#ifdef __NR_seccomp +#ifndef __NR_seccomp #define __NR_seccomp 354 #endif diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index 112bc1da732a..ce33d306c2cb 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * 
tools/testing/selftests/kvm/include/kvm_util.h - * * Copyright (C) 2018, Google LLC. */ #ifndef SELFTEST_KVM_UCALL_COMMON_H diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 4fd042112526..25bc61dac5fb 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -68,6 +68,12 @@ struct xstate { #define XFEATURE_MASK_OPMASK BIT_ULL(5) #define XFEATURE_MASK_ZMM_Hi256 BIT_ULL(6) #define XFEATURE_MASK_Hi16_ZMM BIT_ULL(7) +#define XFEATURE_MASK_PT BIT_ULL(8) +#define XFEATURE_MASK_PKRU BIT_ULL(9) +#define XFEATURE_MASK_PASID BIT_ULL(10) +#define XFEATURE_MASK_CET_USER BIT_ULL(11) +#define XFEATURE_MASK_CET_KERNEL BIT_ULL(12) +#define XFEATURE_MASK_LBR BIT_ULL(15) #define XFEATURE_MASK_XTILE_CFG BIT_ULL(17) #define XFEATURE_MASK_XTILE_DATA BIT_ULL(18) @@ -147,6 +153,7 @@ struct kvm_x86_cpu_feature { #define X86_FEATURE_CLWB KVM_X86_CPU_FEATURE(0x7, 0, EBX, 24) #define X86_FEATURE_UMIP KVM_X86_CPU_FEATURE(0x7, 0, ECX, 2) #define X86_FEATURE_PKU KVM_X86_CPU_FEATURE(0x7, 0, ECX, 3) +#define X86_FEATURE_OSPKE KVM_X86_CPU_FEATURE(0x7, 0, ECX, 4) #define X86_FEATURE_LA57 KVM_X86_CPU_FEATURE(0x7, 0, ECX, 16) #define X86_FEATURE_RDPID KVM_X86_CPU_FEATURE(0x7, 0, ECX, 22) #define X86_FEATURE_SGX_LC KVM_X86_CPU_FEATURE(0x7, 0, ECX, 30) @@ -553,6 +560,13 @@ static inline void xsetbv(u32 index, u64 value) __asm__ __volatile__("xsetbv" :: "a" (eax), "d" (edx), "c" (index)); } +static inline void wrpkru(u32 pkru) +{ + /* Note, ECX and EDX are architecturally required to be '0'. 
*/ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(0), "d"(0)); +} + static inline struct desc_ptr get_gdt(void) { struct desc_ptr gdt; @@ -908,6 +922,15 @@ static inline bool kvm_pmu_has(struct kvm_x86_pmu_feature feature) !kvm_cpu_has(feature.anti_feature); } +static __always_inline uint64_t kvm_cpu_supported_xcr0(void) +{ + if (!kvm_cpu_has_p(X86_PROPERTY_SUPPORTED_XCR0_LO)) + return 0; + + return kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_LO) | + ((uint64_t)kvm_cpu_property(X86_PROPERTY_SUPPORTED_XCR0_HI) << 32); +} + static inline size_t kvm_cpuid2_size(int nr_entries) { return sizeof(struct kvm_cpuid2) + diff --git a/tools/testing/selftests/kvm/lib/guest_sprintf.c b/tools/testing/selftests/kvm/lib/guest_sprintf.c index c4a69d8aeb68..74627514c4d4 100644 --- a/tools/testing/selftests/kvm/lib/guest_sprintf.c +++ b/tools/testing/selftests/kvm/lib/guest_sprintf.c @@ -200,6 +200,13 @@ repeat: ++fmt; } + /* + * Play nice with %llu, %llx, etc. KVM selftests only support + * 64-bit builds, so just treat %ll* the same as %l*. + */ + if (qualifier == 'l' && *fmt == 'l') + ++fmt; + /* default base */ base = 10; diff --git a/tools/testing/selftests/kvm/lib/x86_64/apic.c b/tools/testing/selftests/kvm/lib/x86_64/apic.c index 7168e25c194e..89153a333e83 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/apic.c +++ b/tools/testing/selftests/kvm/lib/x86_64/apic.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * tools/testing/selftests/kvm/lib/x86_64/processor.c - * * Copyright (C) 2021, Google LLC. 
*/ diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c index 20eb2e730800..8698d1ab60d0 100644 --- a/tools/testing/selftests/kvm/memslot_perf_test.c +++ b/tools/testing/selftests/kvm/memslot_perf_test.c @@ -1033,9 +1033,8 @@ static bool test_loop(const struct test_data *data, struct test_result *rbestruntime) { uint64_t maxslots; - struct test_result result; + struct test_result result = {}; - result.nloops = 0; if (!test_execute(targs->nslots, &maxslots, targs->seconds, data, &result.nloops, &result.slot_runtime, &result.guest_runtime)) { @@ -1089,7 +1088,7 @@ int main(int argc, char *argv[]) .seconds = 5, .runs = 1, }; - struct test_result rbestslottime; + struct test_result rbestslottime = {}; int tctr; if (!check_memory_sizes()) @@ -1098,11 +1097,10 @@ int main(int argc, char *argv[]) if (!parse_args(argc, argv, &targs)) return -1; - rbestslottime.slottimens = 0; for (tctr = targs.tfirst; tctr <= targs.tlast; tctr++) { const struct test_data *data = &tests[tctr]; unsigned int runctr; - struct test_result rbestruntime; + struct test_result rbestruntime = {}; if (tctr > targs.tfirst) pr_info("\n"); @@ -1110,7 +1108,6 @@ int main(int argc, char *argv[]) pr_info("Testing %s performance with %i runs, %d seconds each\n", data->name, targs.runs, targs.seconds); - rbestruntime.runtimens = 0; for (runctr = 0; runctr < targs.runs; runctr++) if (!test_loop(data, &targs, &rbestslottime, &rbestruntime)) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c index e446d76d1c0c..6c1278562090 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * KVM_GET/SET_* tests - * * Copyright (C) 2022, Red Hat, Inc. * * Tests for Hyper-V extensions to SVM. 
diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 7f36c32fa760..18ac5c1952a3 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * tools/testing/selftests/kvm/nx_huge_page_test.c - * * Usage: to be run via nx_huge_page_test.sh, which does the necessary * environment setup and teardown * diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh index 0560149e66ed..7cbb409801ee 100755 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.sh @@ -4,7 +4,6 @@ # Wrapper script which performs setup and cleanup for nx_huge_pages_test. # Makes use of root privileges to set up huge pages and KVM module parameters. # -# tools/testing/selftests/kvm/nx_huge_page_test.sh # Copyright (C) 2022, Google LLC. set -e diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c index 4c4925a8ab45..88b58aab7207 100644 --- a/tools/testing/selftests/kvm/x86_64/state_test.c +++ b/tools/testing/selftests/kvm/x86_64/state_test.c @@ -139,6 +139,83 @@ static void vmx_l1_guest_code(struct vmx_pages *vmx_pages) static void __attribute__((__flatten__)) guest_code(void *arg) { GUEST_SYNC(1); + + if (this_cpu_has(X86_FEATURE_XSAVE)) { + uint64_t supported_xcr0 = this_cpu_supported_xcr0(); + uint8_t buffer[4096]; + + memset(buffer, 0xcc, sizeof(buffer)); + + set_cr4(get_cr4() | X86_CR4_OSXSAVE); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSXSAVE)); + + xsetbv(0, xgetbv(0) | supported_xcr0); + + /* + * Modify state for all supported xfeatures to take them out of + * their "init" state, i.e. to make them show up in XSTATE_BV. + * + * Note off-by-default features, e.g. 
AMX, are out of scope for + * this particular testcase as they have a different ABI. + */ + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_FP); + asm volatile ("fincstp"); + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_SSE); + asm volatile ("vmovdqu %0, %%xmm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_YMM) + asm volatile ("vmovdqu %0, %%ymm0" :: "m" (buffer)); + + if (supported_xcr0 & XFEATURE_MASK_AVX512) { + asm volatile ("kmovq %0, %%k1" :: "r" (-1ull)); + asm volatile ("vmovupd %0, %%zmm0" :: "m" (buffer)); + asm volatile ("vmovupd %0, %%zmm16" :: "m" (buffer)); + } + + if (this_cpu_has(X86_FEATURE_MPX)) { + uint64_t bounds[2] = { 10, 0xffffffffull }; + uint64_t output[2] = { }; + + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDREGS); + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_BNDCSR); + + /* + * Don't bother trying to get BNDCSR into the INUSE + * state. MSR_IA32_BNDCFGS doesn't count as it isn't + * managed via XSAVE/XRSTOR, and BNDCFGU can only be + * modified by XRSTOR. Stuffing XSTATE_BV in the host + * is simpler than doing XRSTOR here in the guest. + * + * However, temporarily enable MPX in BNDCFGS so that + * BNDMOV actually loads BND1. If MPX isn't *fully* + * enabled, all MPX instructions are treated as NOPs. + * + * Hand encode "bndmov (%rax),%bnd1" as support for MPX + * mnemonics/registers has been removed from gcc and + * clang (and was never fully supported by clang). + */ + wrmsr(MSR_IA32_BNDCFGS, BIT_ULL(0)); + asm volatile (".byte 0x66,0x0f,0x1a,0x08" :: "a" (bounds)); + /* + * Hand encode "bndmov %bnd1, (%rax)" to sanity check + * that BND1 actually got loaded. 
+ */ + asm volatile (".byte 0x66,0x0f,0x1b,0x08" :: "a" (output)); + wrmsr(MSR_IA32_BNDCFGS, 0); + + GUEST_ASSERT_EQ(bounds[0], output[0]); + GUEST_ASSERT_EQ(bounds[1], output[1]); + } + if (this_cpu_has(X86_FEATURE_PKU)) { + GUEST_ASSERT(supported_xcr0 & XFEATURE_MASK_PKRU); + set_cr4(get_cr4() | X86_CR4_PKE); + GUEST_ASSERT(this_cpu_has(X86_FEATURE_OSPKE)); + + wrpkru(-1u); + } + } + GUEST_SYNC(2); if (arg) { @@ -153,10 +230,11 @@ static void __attribute__((__flatten__)) guest_code(void *arg) int main(int argc, char *argv[]) { + uint64_t *xstate_bv, saved_xstate_bv; vm_vaddr_t nested_gva = 0; - + struct kvm_cpuid2 empty_cpuid = {}; struct kvm_regs regs1, regs2; - struct kvm_vcpu *vcpu; + struct kvm_vcpu *vcpu, *vcpuN; struct kvm_vm *vm; struct kvm_x86_state *state; struct ucall uc; @@ -209,6 +287,34 @@ int main(int argc, char *argv[]) /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); vcpu_load_state(vcpu, state); + + /* + * Restore XSAVE state in a dummy vCPU, first without doing + * KVM_SET_CPUID2, and then with an empty guest CPUID. Except + * for off-by-default xfeatures, e.g. AMX, KVM is supposed to + * allow KVM_SET_XSAVE regardless of guest CPUID. Manually + * load only XSAVE state, MSRs in particular have a much more + * convoluted ABI. + * + * Load two versions of XSAVE state: one with the actual guest + * XSAVE state, and one with all supported features forced "on" + * in xstate_bv, e.g. to ensure that KVM allows loading all + * supported features, even if something goes awry in saving + * the original snapshot. 
+ */ + xstate_bv = (void *)&((uint8_t *)state->xsave->region)[512]; + saved_xstate_bv = *xstate_bv; + + vcpuN = __vm_vcpu_add(vm, vcpu->id + 1); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = kvm_cpu_supported_xcr0(); + vcpu_xsave_set(vcpuN, state->xsave); + + vcpu_init_cpuid(vcpuN, &empty_cpuid); + vcpu_xsave_set(vcpuN, state->xsave); + *xstate_bv = saved_xstate_bv; + vcpu_xsave_set(vcpuN, state->xsave); + kvm_x86_state_cleanup(state); memset(®s2, 0, sizeof(regs2)); diff --git a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c index 5b669818e39a..59c7304f805e 100644 --- a/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c +++ b/tools/testing/selftests/kvm/x86_64/tsc_scaling_sync.c @@ -1,10 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * svm_vmcall_test - * * Copyright © 2021 Amazon.com, Inc. or its affiliates. - * - * Xen shared_info / pvclock testing */ #include "test_util.h" diff --git a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c index 05898ad9f4d9..9ec9ab60b63e 100644 --- a/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c +++ b/tools/testing/selftests/kvm/x86_64/xen_shinfo_test.c @@ -1,10 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * svm_vmcall_test - * * Copyright © 2021 Amazon.com, Inc. or its affiliates. - * - * Xen shared_info / pvclock testing */ #include "test_util.h" |