aboutsummaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/Kconfig1
-rw-r--r--arch/alpha/boot/bootp.c2
-rw-r--r--arch/alpha/boot/bootpz.c2
-rw-r--r--arch/alpha/boot/main.c2
-rw-r--r--arch/alpha/boot/misc.c2
-rw-r--r--arch/alpha/boot/stdio.c16
-rw-r--r--arch/alpha/boot/tools/objstrip.c2
-rw-r--r--arch/alpha/configs/defconfig2
-rw-r--r--arch/alpha/include/asm/Kbuild1
-rw-r--r--arch/alpha/include/asm/asm-offsets.h1
-rw-r--r--arch/alpha/include/asm/div64.h1
-rw-r--r--arch/alpha/include/asm/fpu.h61
-rw-r--r--arch/alpha/include/asm/io.h4
-rw-r--r--arch/alpha/include/asm/irq_regs.h1
-rw-r--r--arch/alpha/include/asm/kdebug.h1
-rw-r--r--arch/alpha/include/asm/thread_info.h18
-rw-r--r--arch/alpha/include/asm/unistd.h2
-rw-r--r--arch/alpha/include/uapi/asm/ptrace.h2
-rw-r--r--arch/alpha/kernel/asm-offsets.c2
-rw-r--r--arch/alpha/kernel/core_cia.c2
-rw-r--r--arch/alpha/kernel/entry.S148
-rw-r--r--arch/alpha/kernel/module.c4
-rw-r--r--arch/alpha/kernel/osf_sys.c2
-rw-r--r--arch/alpha/kernel/pci_iommu.c8
-rw-r--r--arch/alpha/kernel/perf_event.c6
-rw-r--r--arch/alpha/kernel/process.c7
-rw-r--r--arch/alpha/kernel/ptrace.c18
-rw-r--r--arch/alpha/kernel/signal.c20
-rw-r--r--arch/alpha/kernel/traps.c30
-rw-r--r--arch/alpha/lib/fpreg.c43
-rw-r--r--arch/alpha/lib/stacktrace.c2
-rw-r--r--arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts2
-rw-r--r--arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts2
-rw-r--r--arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts2
-rw-r--r--arch/arm/boot/dts/exynos4-cpu-thermal.dtsi2
-rw-r--r--arch/arm/boot/dts/exynos4210.dtsi1
-rw-r--r--arch/arm/boot/dts/exynos5250.dtsi2
-rw-r--r--arch/arm/boot/dts/exynos5410-odroidxu.dts1
-rw-r--r--arch/arm/boot/dts/exynos5422-odroidhc1.dts10
-rw-r--r--arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi10
-rw-r--r--arch/arm/boot/dts/spear320-hmi.dts2
-rw-r--r--arch/arm/mach-qcom/platsmp.c2
-rw-r--r--arch/arm/mach-s3c/Makefile2
-rw-r--r--arch/arm/mm/dma-mapping.c11
-rw-r--r--arch/arm64/boot/dts/qcom/sm8250.dtsi8
-rw-r--r--arch/arm64/configs/defconfig3
-rw-r--r--arch/arm64/include/asm/cache.h9
-rw-r--r--arch/arm64/include/asm/el2_setup.h99
-rw-r--r--arch/arm64/include/asm/esr.h4
-rw-r--r--arch/arm64/include/asm/kvm_arm.h23
-rw-r--r--arch/arm64/include/asm/kvm_emulate.h70
-rw-r--r--arch/arm64/include/asm/kvm_host.h67
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h1
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h15
-rw-r--r--arch/arm64/include/asm/kvm_nested.h20
-rw-r--r--arch/arm64/include/asm/kvm_pgtable.h8
-rw-r--r--arch/arm64/include/asm/sysreg.h39
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h1
-rw-r--r--arch/arm64/kernel/cacheinfo.c5
-rw-r--r--arch/arm64/kernel/cpufeature.c25
-rw-r--r--arch/arm64/kernel/hyp-stub.S86
-rw-r--r--arch/arm64/kvm/Kconfig1
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arch_timer.c106
-rw-r--r--arch/arm64/kvm/arm.c109
-rw-r--r--arch/arm64/kvm/emulate-nested.c203
-rw-r--r--arch/arm64/kvm/fpsimd.c1
-rw-r--r--arch/arm64/kvm/guest.c6
-rw-r--r--arch/arm64/kvm/handle_exit.c47
-rw-r--r--arch/arm64/kvm/hyp/exception.c48
-rw-r--r--arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h21
-rw-r--r--arch/arm64/kvm/hyp/nvhe/hyp-init.S1
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c1
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c43
-rw-r--r--arch/arm64/kvm/hyp/vhe/switch.c26
-rw-r--r--arch/arm64/kvm/hypercalls.c2
-rw-r--r--arch/arm64/kvm/inject_fault.c61
-rw-r--r--arch/arm64/kvm/mmu.c46
-rw-r--r--arch/arm64/kvm/nested.c161
-rw-r--r--arch/arm64/kvm/pvtime.c8
-rw-r--r--arch/arm64/kvm/reset.c25
-rw-r--r--arch/arm64/kvm/sys_regs.c459
-rw-r--r--arch/arm64/kvm/sys_regs.h14
-rw-r--r--arch/arm64/kvm/trace_arm.h59
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c21
-rw-r--r--arch/arm64/kvm/vgic/vgic-mmio.c13
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c9
-rw-r--r--arch/arm64/kvm/vmid.c6
-rw-r--r--arch/arm64/tools/cpucaps1
-rwxr-xr-xarch/arm64/tools/gen-sysreg.awk20
-rw-r--r--arch/arm64/tools/sysreg17
-rw-r--r--arch/csky/lib/delay.c2
-rw-r--r--arch/ia64/kernel/Makefile2
-rw-r--r--arch/loongarch/Kconfig65
-rw-r--r--arch/loongarch/Makefile14
-rw-r--r--arch/loongarch/configs/loongson3_defconfig1
-rw-r--r--arch/loongarch/include/asm/addrspace.h2
-rw-r--r--arch/loongarch/include/asm/asm.h10
-rw-r--r--arch/loongarch/include/asm/asmmacro.h17
-rw-r--r--arch/loongarch/include/asm/cpu.h2
-rw-r--r--arch/loongarch/include/asm/hw_breakpoint.h145
-rw-r--r--arch/loongarch/include/asm/inst.h58
-rw-r--r--arch/loongarch/include/asm/kprobes.h61
-rw-r--r--arch/loongarch/include/asm/loongarch.h35
-rw-r--r--arch/loongarch/include/asm/processor.h16
-rw-r--r--arch/loongarch/include/asm/ptrace.h39
-rw-r--r--arch/loongarch/include/asm/setup.h16
-rw-r--r--arch/loongarch/include/asm/stackframe.h13
-rw-r--r--arch/loongarch/include/asm/switch_to.h1
-rw-r--r--arch/loongarch/include/asm/uaccess.h1
-rw-r--r--arch/loongarch/include/uapi/asm/ptrace.h9
-rw-r--r--arch/loongarch/kernel/Makefile9
-rw-r--r--arch/loongarch/kernel/entry.S91
-rw-r--r--arch/loongarch/kernel/ftrace_dyn.c64
-rw-r--r--arch/loongarch/kernel/genex.S8
-rw-r--r--arch/loongarch/kernel/head.S33
-rw-r--r--arch/loongarch/kernel/hw_breakpoint.c548
-rw-r--r--arch/loongarch/kernel/inst.c123
-rw-r--r--arch/loongarch/kernel/kprobes.c406
-rw-r--r--arch/loongarch/kernel/kprobes_trampoline.S96
-rw-r--r--arch/loongarch/kernel/process.c7
-rw-r--r--arch/loongarch/kernel/ptrace.c472
-rw-r--r--arch/loongarch/kernel/relocate.c242
-rw-r--r--arch/loongarch/kernel/setup.c14
-rw-r--r--arch/loongarch/kernel/time.c11
-rw-r--r--arch/loongarch/kernel/traps.c68
-rw-r--r--arch/loongarch/kernel/vmlinux.lds.S20
-rw-r--r--arch/loongarch/lib/memcpy.S3
-rw-r--r--arch/loongarch/lib/memmove.S4
-rw-r--r--arch/loongarch/lib/memset.S3
-rw-r--r--arch/loongarch/mm/fault.c3
-rw-r--r--arch/loongarch/mm/tlbex.S17
-rw-r--r--arch/loongarch/power/suspend_asm.S5
-rw-r--r--arch/m68k/68000/dragen2.c2
-rw-r--r--arch/m68k/Kconfig.machine8
-rw-r--r--arch/mips/Kbuild2
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/Makefile13
-rw-r--r--arch/mips/Makefile.postlink2
-rw-r--r--arch/mips/bcm47xx/board.c1
-rw-r--r--arch/mips/bcm47xx/buttons.c9
-rw-r--r--arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts10
-rw-r--r--arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts6
-rw-r--r--arch/mips/boot/dts/img/boston.dts2
-rw-r--r--arch/mips/boot/dts/ingenic/ci20.dts10
-rw-r--r--arch/mips/boot/dts/ingenic/jz4780.dtsi2
-rw-r--r--arch/mips/boot/dts/lantiq/danube.dtsi1
-rw-r--r--arch/mips/boot/dts/pic32/pic32mzda_sk.dts6
-rw-r--r--arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts8
-rw-r--r--arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts8
-rw-r--r--arch/mips/boot/dts/qca/ar9331_omega.dts2
-rw-r--r--arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts8
-rw-r--r--arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts22
-rw-r--r--arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts20
-rw-r--r--arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts21
-rw-r--r--arch/mips/boot/dts/ralink/mt7621.dtsi19
-rw-r--r--arch/mips/cavium-octeon/octeon-usb.c42
-rw-r--r--arch/mips/include/asm/asmmacro-32.h4
-rw-r--r--arch/mips/include/asm/asmmacro.h42
-rw-r--r--arch/mips/include/asm/fpregdef.h14
-rw-r--r--arch/mips/include/asm/kvm_host.h3
-rw-r--r--arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h1
-rw-r--r--arch/mips/include/asm/mach-rc32434/pci.h2
-rw-r--r--arch/mips/include/asm/mipsregs.h20
-rw-r--r--arch/mips/include/asm/syscall.h2
-rw-r--r--arch/mips/include/asm/vpe.h1
-rw-r--r--arch/mips/kernel/genex.S2
-rw-r--r--arch/mips/kernel/r2300_fpu.S4
-rw-r--r--arch/mips/kernel/r4k_fpu.S12
-rw-r--r--arch/mips/kernel/smp-cps.c8
-rw-r--r--arch/mips/kernel/vpe-mt.c7
-rw-r--r--arch/mips/kvm/Kconfig1
-rw-r--r--arch/mips/kvm/Makefile2
-rw-r--r--arch/mips/kvm/callback.c14
-rw-r--r--arch/mips/kvm/fpu.S6
-rw-r--r--arch/mips/kvm/mips.c34
-rw-r--r--arch/mips/kvm/vz.c7
-rw-r--r--arch/mips/lantiq/prom.c6
-rw-r--r--arch/mips/lantiq/xway/dcdc.c5
-rw-r--r--arch/mips/lantiq/xway/dma.c4
-rw-r--r--arch/mips/lantiq/xway/gptu.c5
-rw-r--r--arch/mips/loongson2ef/Platform2
-rw-r--r--arch/mips/pci/pci-lantiq.c8
-rw-r--r--arch/mips/pci/pci-mt7620.c8
-rw-r--r--arch/mips/ralink/Kconfig5
-rw-r--r--arch/mips/ralink/timer.c3
-rw-r--r--arch/mips/vdso/Kconfig14
-rw-r--r--arch/mips/vdso/Makefile3
-rw-r--r--arch/powerpc/Kconfig19
-rw-r--r--arch/powerpc/Makefile28
-rw-r--r--arch/powerpc/Makefile.postlink2
-rw-r--r--arch/powerpc/boot/Makefile14
-rw-r--r--arch/powerpc/boot/dts/turris1x.dts23
-rw-r--r--arch/powerpc/configs/ps3_defconfig39
-rw-r--r--arch/powerpc/crypto/crc32-vpmsum_core.S13
-rw-r--r--arch/powerpc/include/asm/barrier.h12
-rw-r--r--arch/powerpc/include/asm/hvcall.h1
-rw-r--r--arch/powerpc/include/asm/hw_irq.h6
-rw-r--r--arch/powerpc/include/asm/interrupt.h35
-rw-r--r--arch/powerpc/include/asm/irq.h3
-rw-r--r--arch/powerpc/include/asm/kvm_host.h3
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h1
-rw-r--r--arch/powerpc/include/asm/machdep.h16
-rw-r--r--arch/powerpc/include/asm/paca.h1
-rw-r--r--arch/powerpc/include/asm/papr-sysparm.h38
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h4
-rw-r--r--arch/powerpc/include/asm/plpks.h195
-rw-r--r--arch/powerpc/include/asm/rtas-types.h2
-rw-r--r--arch/powerpc/include/asm/rtas-work-area.h96
-rw-r--r--arch/powerpc/include/asm/rtas.h184
-rw-r--r--arch/powerpc/include/asm/secvar.h21
-rw-r--r--arch/powerpc/include/asm/smp.h1
-rw-r--r--arch/powerpc/include/asm/trace.h103
-rw-r--r--arch/powerpc/kernel/Makefile10
-rw-r--r--arch/powerpc/kernel/eeh_driver.c4
-rw-r--r--arch/powerpc/kernel/epapr_hcalls.S6
-rw-r--r--arch/powerpc/kernel/head_64.S51
-rw-r--r--arch/powerpc/kernel/iommu.c4
-rw-r--r--arch/powerpc/kernel/irq_64.c105
-rw-r--r--arch/powerpc/kernel/mce.c10
-rw-r--r--arch/powerpc/kernel/module_64.c29
-rw-r--r--arch/powerpc/kernel/pci_32.c17
-rw-r--r--arch/powerpc/kernel/process.c14
-rw-r--r--arch/powerpc/kernel/prom.c16
-rw-r--r--arch/powerpc/kernel/prom_init_check.sh9
-rw-r--r--arch/powerpc/kernel/rtas-proc.c24
-rw-r--r--arch/powerpc/kernel/rtas-rtc.c6
-rw-r--r--arch/powerpc/kernel/rtas.c1056
-rw-r--r--arch/powerpc/kernel/rtas_flash.c21
-rw-r--r--arch/powerpc/kernel/rtas_pci.c8
-rw-r--r--arch/powerpc/kernel/rtasd.c2
-rw-r--r--arch/powerpc/kernel/secvar-ops.c10
-rw-r--r--arch/powerpc/kernel/secvar-sysfs.c178
-rw-r--r--arch/powerpc/kernel/setup-common.c4
-rw-r--r--arch/powerpc/kernel/setup_64.c16
-rw-r--r--arch/powerpc/kernel/time.c4
-rw-r--r--arch/powerpc/kernel/trace/Makefile1
-rw-r--r--arch/powerpc/kernel/vdso/Makefile28
-rw-r--r--arch/powerpc/kexec/file_load_64.c21
-rw-r--r--arch/powerpc/kvm/book3s.c12
-rw-r--r--arch/powerpc/kvm/booke.c2
-rw-r--r--arch/powerpc/kvm/e500.c6
-rw-r--r--arch/powerpc/kvm/e500mc.c8
-rw-r--r--arch/powerpc/kvm/powerpc.c20
-rw-r--r--arch/powerpc/lib/Makefile2
-rw-r--r--arch/powerpc/mm/book3s64/hash_utils.c3
-rw-r--r--arch/powerpc/mm/book3s64/radix_tlb.c73
-rw-r--r--arch/powerpc/mm/mmu_decl.h1
-rw-r--r--arch/powerpc/mm/nohash/e500_hugetlbpage.c5
-rw-r--r--arch/powerpc/mm/nohash/tlb_low_64e.S2
-rw-r--r--arch/powerpc/net/bpf_jit.h2
-rw-r--r--arch/powerpc/net/bpf_jit_comp.c91
-rw-r--r--arch/powerpc/net/bpf_jit_comp32.c400
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c16
-rw-r--r--arch/powerpc/perf/hv-24x7.c44
-rw-r--r--arch/powerpc/platforms/44x/fsp2.c2
-rw-r--r--arch/powerpc/platforms/52xx/efika.c4
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype20
-rw-r--r--arch/powerpc/platforms/cell/ras.c4
-rw-r--r--arch/powerpc/platforms/cell/smp.c4
-rw-r--r--arch/powerpc/platforms/chrp/nvram.c4
-rw-r--r--arch/powerpc/platforms/chrp/pci.c4
-rw-r--r--arch/powerpc/platforms/chrp/setup.c4
-rw-r--r--arch/powerpc/platforms/maple/setup.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-secvar.c60
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c3
-rw-r--r--arch/powerpc/platforms/ps3/htab.c2
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig20
-rw-r--r--arch/powerpc/platforms/pseries/Makefile6
-rw-r--r--arch/powerpc/platforms/pseries/dlpar.c29
-rw-r--r--arch/powerpc/platforms/pseries/eeh_pseries.c22
-rw-r--r--arch/powerpc/platforms/pseries/hotplug-cpu.c4
-rw-r--r--arch/powerpc/platforms/pseries/io_event_irq.c2
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c37
-rw-r--r--arch/powerpc/platforms/pseries/lparcfg.c104
-rw-r--r--arch/powerpc/platforms/pseries/mobility.c4
-rw-r--r--arch/powerpc/platforms/pseries/msi.c4
-rw-r--r--arch/powerpc/platforms/pseries/nvram.c4
-rw-r--r--arch/powerpc/platforms/pseries/papr-sysparm.c151
-rw-r--r--arch/powerpc/platforms/pseries/pci.c2
-rw-r--r--arch/powerpc/platforms/pseries/plpks-secvar.c217
-rw-r--r--arch/powerpc/platforms/pseries/plpks.c385
-rw-r--r--arch/powerpc/platforms/pseries/plpks.h71
-rw-r--r--arch/powerpc/platforms/pseries/ras.c2
-rw-r--r--arch/powerpc/platforms/pseries/rtas-work-area.c209
-rw-r--r--arch/powerpc/platforms/pseries/setup.c29
-rw-r--r--arch/powerpc/platforms/pseries/smp.c12
-rw-r--r--arch/powerpc/purgatory/Makefile1
-rw-r--r--arch/powerpc/sysdev/xics/ics-rtas.c8
-rw-r--r--arch/powerpc/xmon/Makefile1
-rw-r--r--arch/powerpc/xmon/xmon.c16
-rw-r--r--arch/riscv/Kconfig78
-rw-r--r--arch/riscv/Kconfig.socs5
-rw-r--r--arch/riscv/Makefile9
-rw-r--r--arch/riscv/errata/sifive/errata.c6
-rw-r--r--arch/riscv/errata/thead/errata.c17
-rw-r--r--arch/riscv/include/asm/alternative-macros.h20
-rw-r--r--arch/riscv/include/asm/alternative.h20
-rw-r--r--arch/riscv/include/asm/elf.h10
-rw-r--r--arch/riscv/include/asm/errata_list.h12
-rw-r--r--arch/riscv/include/asm/ftrace.h50
-rw-r--r--arch/riscv/include/asm/hwcap.h112
-rw-r--r--arch/riscv/include/asm/insn-def.h58
-rw-r--r--arch/riscv/include/asm/insn.h381
-rw-r--r--arch/riscv/include/asm/jump_label.h2
-rw-r--r--arch/riscv/include/asm/kvm_host.h11
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_pmu.h107
-rw-r--r--arch/riscv/include/asm/kvm_vcpu_sbi.h13
-rw-r--r--arch/riscv/include/asm/module.h16
-rw-r--r--arch/riscv/include/asm/parse_asm.h219
-rw-r--r--arch/riscv/include/asm/pgtable.h4
-rw-r--r--arch/riscv/include/asm/sbi.h7
-rw-r--r--arch/riscv/include/asm/signal.h2
-rw-r--r--arch/riscv/include/asm/string.h10
-rw-r--r--arch/riscv/include/asm/switch_to.h3
-rw-r--r--arch/riscv/include/asm/thread_info.h1
-rw-r--r--arch/riscv/include/asm/vdso.h4
-rw-r--r--arch/riscv/kernel/alternative.c113
-rw-r--r--arch/riscv/kernel/cpu.c54
-rw-r--r--arch/riscv/kernel/cpufeature.c85
-rw-r--r--arch/riscv/kernel/ftrace.c65
-rw-r--r--arch/riscv/kernel/kgdb.c63
-rw-r--r--arch/riscv/kernel/mcount-dyn.S42
-rw-r--r--arch/riscv/kernel/module.c31
-rw-r--r--arch/riscv/kernel/probes/simulate-insn.c19
-rw-r--r--arch/riscv/kernel/probes/simulate-insn.h29
-rw-r--r--arch/riscv/kernel/riscv_ksyms.c3
-rw-r--r--arch/riscv/kernel/setup.c3
-rw-r--r--arch/riscv/kernel/traps.c30
-rw-r--r--arch/riscv/kernel/vdso.c5
-rw-r--r--arch/riscv/kernel/vdso/vdso.lds.S7
-rw-r--r--arch/riscv/kernel/vmlinux.lds.S9
-rw-r--r--arch/riscv/kvm/Kconfig1
-rw-r--r--arch/riscv/kvm/Makefile1
-rw-r--r--arch/riscv/kvm/main.c26
-rw-r--r--arch/riscv/kvm/mmu.c20
-rw-r--r--arch/riscv/kvm/tlb.c7
-rw-r--r--arch/riscv/kvm/vcpu.c7
-rw-r--r--arch/riscv/kvm/vcpu_exit.c9
-rw-r--r--arch/riscv/kvm/vcpu_insn.c4
-rw-r--r--arch/riscv/kvm/vcpu_pmu.c633
-rw-r--r--arch/riscv/kvm/vcpu_sbi.c72
-rw-r--r--arch/riscv/kvm/vcpu_sbi_base.c27
-rw-r--r--arch/riscv/kvm/vcpu_sbi_hsm.c28
-rw-r--r--arch/riscv/kvm/vcpu_sbi_pmu.c86
-rw-r--r--arch/riscv/kvm/vcpu_sbi_replace.c50
-rw-r--r--arch/riscv/kvm/vcpu_sbi_v01.c17
-rw-r--r--arch/riscv/kvm/vmid.c4
-rw-r--r--arch/riscv/lib/Makefile3
-rw-r--r--arch/riscv/lib/strcmp.S121
-rw-r--r--arch/riscv/lib/strlen.S133
-rw-r--r--arch/riscv/lib/strncmp.S139
-rw-r--r--arch/riscv/mm/fault.c10
-rw-r--r--arch/riscv/purgatory/Makefile13
-rw-r--r--arch/s390/Kconfig8
-rw-r--r--arch/s390/configs/debug_defconfig1
-rw-r--r--arch/s390/configs/defconfig1
-rw-r--r--arch/s390/include/asm/kvm_host.h1
-rw-r--r--arch/s390/include/asm/msi.h17
-rw-r--r--arch/s390/include/asm/pci_dma.h5
-rw-r--r--arch/s390/kernel/vdso64/Makefile4
-rw-r--r--arch/s390/kvm/gaccess.c109
-rw-r--r--arch/s390/kvm/gaccess.h3
-rw-r--r--arch/s390/kvm/interrupt.c13
-rw-r--r--arch/s390/kvm/kvm-s390.c348
-rw-r--r--arch/s390/kvm/kvm-s390.h2
-rw-r--r--arch/s390/kvm/pci.c2
-rw-r--r--arch/s390/kvm/pci.h2
-rw-r--r--arch/s390/pci/pci_dma.c31
-rw-r--r--arch/s390/purgatory/Makefile2
-rw-r--r--arch/sh/boot/compressed/Makefile7
-rw-r--r--arch/um/Kconfig7
-rw-r--r--arch/um/Makefile7
-rw-r--r--arch/um/drivers/Kconfig2
-rw-r--r--arch/um/drivers/Makefile2
-rw-r--r--arch/um/drivers/pcap_kern.c4
-rw-r--r--arch/um/drivers/vector_kern.c1
-rw-r--r--arch/um/drivers/vector_user.h2
-rw-r--r--arch/um/drivers/virt-pci.c139
-rw-r--r--arch/um/drivers/virtio_uml.c20
-rw-r--r--arch/um/include/asm/processor-generic.h2
-rw-r--r--arch/um/kernel/Makefile2
-rw-r--r--arch/um/kernel/exec.c4
-rw-r--r--arch/um/kernel/skas/Makefile2
-rw-r--r--arch/um/kernel/tlb.c6
-rw-r--r--arch/um/kernel/um_arch.c2
-rw-r--r--arch/um/kernel/vmlinux.lds.S2
-rw-r--r--arch/um/os-Linux/Makefile2
-rw-r--r--arch/um/os-Linux/drivers/Makefile2
-rw-r--r--arch/um/os-Linux/irq.c4
-rw-r--r--arch/um/os-Linux/skas/Makefile2
-rw-r--r--arch/um/os-Linux/skas/mem.c19
-rw-r--r--arch/um/os-Linux/skas/process.c121
-rw-r--r--arch/x86/Makefile.um8
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/coco/tdx/tdcall.S81
-rw-r--r--arch/x86/coco/tdx/tdx.c62
-rw-r--r--arch/x86/events/intel/core.c1
-rw-r--r--arch/x86/events/intel/ds.c4
-rw-r--r--arch/x86/include/asm/cpufeatures.h3
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h3
-rw-r--r--arch/x86/include/asm/idtentry.h16
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h109
-rw-r--r--arch/x86/include/asm/reboot.h2
-rw-r--r--arch/x86/include/asm/shared/tdx.h6
-rw-r--r--arch/x86/include/asm/virtext.h16
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h4
-rw-r--r--arch/x86/include/uapi/asm/kvm.h34
-rw-r--r--arch/x86/kernel/asm-offsets.c6
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c2
-rw-r--r--arch/x86/kernel/crash.c17
-rw-r--r--arch/x86/kernel/nmi.c8
-rw-r--r--arch/x86/kernel/reboot.c88
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c43
-rw-r--r--arch/x86/kvm/debugfs.c2
-rw-r--r--arch/x86/kvm/emulate.c18
-rw-r--r--arch/x86/kvm/hyperv.c85
-rw-r--r--arch/x86/kvm/hyperv.h27
-rw-r--r--arch/x86/kvm/i8254.c4
-rw-r--r--arch/x86/kvm/i8259.c4
-rw-r--r--arch/x86/kvm/ioapic.c1
-rw-r--r--arch/x86/kvm/irq.c1
-rw-r--r--arch/x86/kvm/irq_comm.c7
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h12
-rw-r--r--arch/x86/kvm/kvm_emulate.h7
-rw-r--r--arch/x86/kvm/kvm_onhyperv.c1
-rw-r--r--arch/x86/kvm/lapic.c404
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/mmu/mmu.c320
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h22
-rw-r--r--arch/x86/kvm/mmu/page_track.c1
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h25
-rw-r--r--arch/x86/kvm/mmu/spte.c10
-rw-r--r--arch/x86/kvm/mmu/spte.h20
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.c12
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c20
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.h25
-rw-r--r--arch/x86/kvm/mtrr.c1
-rw-r--r--arch/x86/kvm/pmu.c290
-rw-r--r--arch/x86/kvm/pmu.h13
-rw-r--r--arch/x86/kvm/reverse_cpuid.h7
-rw-r--r--arch/x86/kvm/smm.c3
-rw-r--r--arch/x86/kvm/svm/avic.c374
-rw-r--r--arch/x86/kvm/svm/nested.c5
-rw-r--r--arch/x86/kvm/svm/pmu.c4
-rw-r--r--arch/x86/kvm/svm/sev.c7
-rw-r--r--arch/x86/kvm/svm/svm.c131
-rw-r--r--arch/x86/kvm/svm/svm.h58
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.c1
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h8
-rw-r--r--arch/x86/kvm/vmx/capabilities.h4
-rw-r--r--arch/x86/kvm/vmx/hyperv.c87
-rw-r--r--arch/x86/kvm/vmx/hyperv.h128
-rw-r--r--arch/x86/kvm/vmx/nested.c24
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c28
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/sgx.c5
-rw-r--r--arch/x86/kvm/vmx/vmcs.h4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S80
-rw-r--r--arch/x86/kvm/vmx/vmx.c624
-rw-r--r--arch/x86/kvm/vmx/vmx.h18
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h6
-rw-r--r--arch/x86/kvm/x86.c557
-rw-r--r--arch/x86/kvm/x86.h18
-rw-r--r--arch/x86/kvm/xen.c27
-rw-r--r--arch/x86/kvm/xen.h7
-rw-r--r--arch/x86/tools/Makefile2
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/os-Linux/Makefile2
-rw-r--r--arch/x86/um/vdso/Makefile2
-rw-r--r--arch/x86/um/vdso/um_vdso.c12
476 files changed, 13608 insertions, 4920 deletions
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 97fce7386b00..780d4673c3ca 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -3,6 +3,7 @@ config ALPHA
bool
default y
select ARCH_32BIT_USTAT_F_TINODE
+ select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_NO_PREEMPT
diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c
index b4faba2432d5..842e85776cc0 100644
--- a/arch/alpha/boot/bootp.c
+++ b/arch/alpha/boot/bootp.c
@@ -18,7 +18,7 @@
#include <asm/hwrpb.h>
#include <asm/io.h>
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include "ksize.h"
diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c
index 90a2b341e9c0..c6079308eab3 100644
--- a/arch/alpha/boot/bootpz.c
+++ b/arch/alpha/boot/bootpz.c
@@ -20,7 +20,7 @@
#include <asm/hwrpb.h>
#include <asm/io.h>
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include "kzsize.h"
diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c
index e5347a080008..22a1cb0264af 100644
--- a/arch/alpha/boot/main.c
+++ b/arch/alpha/boot/main.c
@@ -15,7 +15,7 @@
#include <asm/console.h>
#include <asm/hwrpb.h>
-#include <stdarg.h>
+#include <linux/stdarg.h>
#include "ksize.h"
diff --git a/arch/alpha/boot/misc.c b/arch/alpha/boot/misc.c
index 325d4dd4f904..1ab91852d9f7 100644
--- a/arch/alpha/boot/misc.c
+++ b/arch/alpha/boot/misc.c
@@ -89,8 +89,6 @@ static ulg output_ptr;
static ulg bytes_out;
static void error(char *m);
-static void gzip_mark(void **);
-static void gzip_release(void **);
extern int end;
static ulg free_mem_ptr;
diff --git a/arch/alpha/boot/stdio.c b/arch/alpha/boot/stdio.c
index 60f73ccd2e89..faa5234b90b8 100644
--- a/arch/alpha/boot/stdio.c
+++ b/arch/alpha/boot/stdio.c
@@ -2,8 +2,8 @@
/*
* Copyright (C) Paul Mackerras 1997.
*/
-#include <stdarg.h>
-#include <stddef.h>
+#include <linux/string.h>
+#include <linux/stdarg.h>
size_t strnlen(const char * s, size_t count)
{
@@ -42,8 +42,8 @@ static int skip_atoi(const char **s)
static char * number(char * str, unsigned long long num, int base, int size, int precision, int type)
{
- char c,sign,tmp[66];
- const char *digits="0123456789abcdefghijklmnopqrstuvwxyz";
+ char c, sign, tmp[66];
+ const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
int i;
if (type & LARGE)
@@ -83,14 +83,14 @@ static char * number(char * str, unsigned long long num, int base, int size, int
precision = i;
size -= precision;
if (!(type&(ZEROPAD+LEFT)))
- while(size-->0)
+ while (size-- > 0)
*str++ = ' ';
if (sign)
*str++ = sign;
if (type & SPECIAL) {
if (base==8)
*str++ = '0';
- else if (base==16) {
+ else if (base == 16) {
*str++ = '0';
*str++ = digits[33];
}
@@ -125,7 +125,7 @@ int vsprintf(char *buf, const char *fmt, va_list args)
/* 'z' changed to 'Z' --davidm 1/25/99 */
- for (str=buf ; *fmt ; ++fmt) {
+ for (str = buf ; *fmt ; ++fmt) {
if (*fmt != '%') {
*str++ = *fmt;
continue;
@@ -296,7 +296,7 @@ int sprintf(char * buf, const char *fmt, ...)
int i;
va_start(args, fmt);
- i=vsprintf(buf,fmt,args);
+ i = vsprintf(buf, fmt, args);
va_end(args);
return i;
}
diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c
index 08b430d25a31..7cf92d172dce 100644
--- a/arch/alpha/boot/tools/objstrip.c
+++ b/arch/alpha/boot/tools/objstrip.c
@@ -148,7 +148,7 @@ main (int argc, char *argv[])
#ifdef __ELF__
elf = (struct elfhdr *) buf;
- if (elf->e_ident[0] == 0x7f && str_has_prefix((char *)elf->e_ident + 1, "ELF")) {
+ if (memcmp(&elf->e_ident[EI_MAG0], ELFMAG, SELFMAG) == 0) {
if (elf->e_type != ET_EXEC) {
fprintf(stderr, "%s: %s is not an ELF executable\n",
prog_name, inname);
diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
index 6a39fe8ce9e5..1816c1dc22b1 100644
--- a/arch/alpha/configs/defconfig
+++ b/arch/alpha/configs/defconfig
@@ -39,14 +39,12 @@ CONFIG_PATA_CYPRESS=y
CONFIG_ATA_GENERIC=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=m
-CONFIG_NET_ETHERNET=y
CONFIG_NET_VENDOR_3COM=y
CONFIG_VORTEX=y
CONFIG_NET_TULIP=y
CONFIG_DE2104X=m
CONFIG_TULIP=y
CONFIG_TULIP_MMIO=y
-CONFIG_NET_PCI=y
CONFIG_YELLOWFIN=y
CONFIG_SERIAL_8250=y
CONFIG_SERIAL_8250_CONSOLE=y
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index 54f5126628c6..dd31e97edae8 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -2,6 +2,7 @@
generated-y += syscall_table.h
generic-y += agp.h
+generic-y += asm-offsets.h
generic-y += export.h
generic-y += kvm_para.h
generic-y += mcs_spinlock.h
diff --git a/arch/alpha/include/asm/asm-offsets.h b/arch/alpha/include/asm/asm-offsets.h
deleted file mode 100644
index d370ee36a182..000000000000
--- a/arch/alpha/include/asm/asm-offsets.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <generated/asm-offsets.h>
diff --git a/arch/alpha/include/asm/div64.h b/arch/alpha/include/asm/div64.h
deleted file mode 100644
index 6cd978cefb28..000000000000
--- a/arch/alpha/include/asm/div64.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/div64.h>
diff --git a/arch/alpha/include/asm/fpu.h b/arch/alpha/include/asm/fpu.h
index b9691405e56b..30b24135dd7a 100644
--- a/arch/alpha/include/asm/fpu.h
+++ b/arch/alpha/include/asm/fpu.h
@@ -15,21 +15,27 @@ rdfpcr(void)
{
unsigned long tmp, ret;
+ preempt_disable();
+ if (current_thread_info()->status & TS_SAVED_FP) {
+ ret = current_thread_info()->fp[31];
+ } else {
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
- __asm__ __volatile__ (
- "ftoit $f0,%0\n\t"
- "mf_fpcr $f0\n\t"
- "ftoit $f0,%1\n\t"
- "itoft %0,$f0"
- : "=r"(tmp), "=r"(ret));
+ __asm__ __volatile__ (
+ "ftoit $f0,%0\n\t"
+ "mf_fpcr $f0\n\t"
+ "ftoit $f0,%1\n\t"
+ "itoft %0,$f0"
+ : "=r"(tmp), "=r"(ret));
#else
- __asm__ __volatile__ (
- "stt $f0,%0\n\t"
- "mf_fpcr $f0\n\t"
- "stt $f0,%1\n\t"
- "ldt $f0,%0"
- : "=m"(tmp), "=m"(ret));
+ __asm__ __volatile__ (
+ "stt $f0,%0\n\t"
+ "mf_fpcr $f0\n\t"
+ "stt $f0,%1\n\t"
+ "ldt $f0,%0"
+ : "=m"(tmp), "=m"(ret));
#endif
+ }
+ preempt_enable();
return ret;
}
@@ -39,21 +45,28 @@ wrfpcr(unsigned long val)
{
unsigned long tmp;
+ preempt_disable();
+ if (current_thread_info()->status & TS_SAVED_FP) {
+ current_thread_info()->status |= TS_RESTORE_FP;
+ current_thread_info()->fp[31] = val;
+ } else {
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
- __asm__ __volatile__ (
- "ftoit $f0,%0\n\t"
- "itoft %1,$f0\n\t"
- "mt_fpcr $f0\n\t"
- "itoft %0,$f0"
- : "=&r"(tmp) : "r"(val));
+ __asm__ __volatile__ (
+ "ftoit $f0,%0\n\t"
+ "itoft %1,$f0\n\t"
+ "mt_fpcr $f0\n\t"
+ "itoft %0,$f0"
+ : "=&r"(tmp) : "r"(val));
#else
- __asm__ __volatile__ (
- "stt $f0,%0\n\t"
- "ldt $f0,%1\n\t"
- "mt_fpcr $f0\n\t"
- "ldt $f0,%0"
- : "=m"(tmp) : "m"(val));
+ __asm__ __volatile__ (
+ "stt $f0,%0\n\t"
+ "ldt $f0,%1\n\t"
+ "mt_fpcr $f0\n\t"
+ "ldt $f0,%0"
+ : "=m"(tmp) : "m"(val));
#endif
+ }
+ preempt_enable();
}
static inline unsigned long
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index 1c3605d874e9..7aeaf7c30a6f 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -14,10 +14,6 @@
the implementation we have here matches that interface. */
#include <asm-generic/iomap.h>
-/* We don't use IO slowdowns on the Alpha, but.. */
-#define __SLOW_DOWN_IO do { } while (0)
-#define SLOW_DOWN_IO do { } while (0)
-
/*
* Virtual -> physical identity mapping starts at this offset
*/
diff --git a/arch/alpha/include/asm/irq_regs.h b/arch/alpha/include/asm/irq_regs.h
deleted file mode 100644
index 3dd9c0b70270..000000000000
--- a/arch/alpha/include/asm/irq_regs.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/irq_regs.h>
diff --git a/arch/alpha/include/asm/kdebug.h b/arch/alpha/include/asm/kdebug.h
deleted file mode 100644
index 6ece1b037665..000000000000
--- a/arch/alpha/include/asm/kdebug.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kdebug.h>
diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h
index 082631465074..4a4d00b37986 100644
--- a/arch/alpha/include/asm/thread_info.h
+++ b/arch/alpha/include/asm/thread_info.h
@@ -26,6 +26,7 @@ struct thread_info {
int bpt_nsaved;
unsigned long bpt_addr[2]; /* breakpoint handling */
unsigned int bpt_insn[2];
+ unsigned long fp[32];
};
/*
@@ -41,6 +42,8 @@ struct thread_info {
register struct thread_info *__current_thread_info __asm__("$8");
#define current_thread_info() __current_thread_info
+register unsigned long *current_stack_pointer __asm__ ("$30");
+
#endif /* __ASSEMBLY__ */
/* Thread information allocation. */
@@ -81,6 +84,9 @@ register struct thread_info *__current_thread_info __asm__("$8");
#define TS_UAC_NOFIX 0x0002 /* ! flags as they match */
#define TS_UAC_SIGBUS 0x0004 /* ! userspace part of 'osf_sysinfo' */
+#define TS_SAVED_FP 0x0008
+#define TS_RESTORE_FP 0x0010
+
#define SET_UNALIGN_CTL(task,value) ({ \
__u32 status = task_thread_info(task)->status & ~UAC_BITMASK; \
if (value & PR_UNALIGN_NOPRINT) \
@@ -104,5 +110,17 @@ register struct thread_info *__current_thread_info __asm__("$8");
put_user(res, (int __user *)(value)); \
})
+#ifndef __ASSEMBLY__
+extern void __save_fpu(void);
+
+static inline void save_fpu(void)
+{
+ if (!(current_thread_info()->status & TS_SAVED_FP)) {
+ current_thread_info()->status |= TS_SAVED_FP;
+ __save_fpu();
+ }
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ALPHA_THREAD_INFO_H */
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index 986f5da9b7d8..caabd92ea709 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -4,7 +4,7 @@
#include <uapi/asm/unistd.h>
-#define NR_SYSCALLS __NR_syscalls
+#define NR_syscalls __NR_syscalls
#define __ARCH_WANT_NEW_STAT
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/alpha/include/uapi/asm/ptrace.h b/arch/alpha/include/uapi/asm/ptrace.h
index c29194181025..5ca45934fcbb 100644
--- a/arch/alpha/include/uapi/asm/ptrace.h
+++ b/arch/alpha/include/uapi/asm/ptrace.h
@@ -64,7 +64,9 @@ struct switch_stack {
unsigned long r14;
unsigned long r15;
unsigned long r26;
+#ifndef __KERNEL__
unsigned long fp[32]; /* fp[31] is fpcr */
+#endif
};
diff --git a/arch/alpha/kernel/asm-offsets.c b/arch/alpha/kernel/asm-offsets.c
index 2e125e5c1508..b121294bee26 100644
--- a/arch/alpha/kernel/asm-offsets.c
+++ b/arch/alpha/kernel/asm-offsets.c
@@ -17,6 +17,8 @@ void foo(void)
DEFINE(TI_TASK, offsetof(struct thread_info, task));
DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
DEFINE(TI_CPU, offsetof(struct thread_info, cpu));
+ DEFINE(TI_FP, offsetof(struct thread_info, fp));
+ DEFINE(TI_STATUS, offsetof(struct thread_info, status));
BLANK();
DEFINE(TASK_BLOCKED, offsetof(struct task_struct, blocked));
diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c
index f489170201c3..12926e9538b8 100644
--- a/arch/alpha/kernel/core_cia.c
+++ b/arch/alpha/kernel/core_cia.c
@@ -527,7 +527,7 @@ verify_tb_operation(void)
if (use_tbia_try2) {
alpha_mv.mv_pci_tbi = cia_pci_tbi_try2;
- /* Tags 0-3 must be disabled if we use this workaraund. */
+ /* Tags 0-3 must be disabled if we use this workaround. */
wmb();
*(vip)CIA_IOC_TB_TAGn(0) = 2;
*(vip)CIA_IOC_TB_TAGn(1) = 2;
diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S
index a6207c47f089..eb51f93a70c8 100644
--- a/arch/alpha/kernel/entry.S
+++ b/arch/alpha/kernel/entry.S
@@ -17,7 +17,7 @@
/* Stack offsets. */
#define SP_OFF 184
-#define SWITCH_STACK_SIZE 320
+#define SWITCH_STACK_SIZE 64
.macro CFI_START_OSF_FRAME func
.align 4
@@ -159,7 +159,6 @@
.cfi_rel_offset $13, 32
.cfi_rel_offset $14, 40
.cfi_rel_offset $15, 48
- /* We don't really care about the FP registers for debugging. */
.endm
.macro UNDO_SWITCH_STACK
@@ -454,7 +453,7 @@ entSys:
SAVE_ALL
lda $8, 0x3fff
bic $sp, $8, $8
- lda $4, NR_SYSCALLS($31)
+ lda $4, NR_syscalls($31)
stq $16, SP_OFF+24($sp)
lda $5, sys_call_table
lda $27, sys_ni_syscall
@@ -498,6 +497,10 @@ ret_to_user:
and $17, _TIF_WORK_MASK, $2
bne $2, work_pending
restore_all:
+ ldl $2, TI_STATUS($8)
+ and $2, TS_SAVED_FP | TS_RESTORE_FP, $3
+ bne $3, restore_fpu
+restore_other:
.cfi_remember_state
RESTORE_ALL
call_pal PAL_rti
@@ -506,7 +509,7 @@ ret_to_kernel:
.cfi_restore_state
lda $16, 7
call_pal PAL_swpipl
- br restore_all
+ br restore_other
.align 3
$syscall_error:
@@ -570,6 +573,14 @@ $work_notifysig:
.type strace, @function
strace:
/* set up signal stack, call syscall_trace */
+ // NB: if anyone adds preemption, this block will need to be protected
+ ldl $1, TI_STATUS($8)
+ and $1, TS_SAVED_FP, $3
+ or $1, TS_SAVED_FP, $2
+ bne $3, 1f
+ stl $2, TI_STATUS($8)
+ bsr $26, __save_fpu
+1:
DO_SWITCH_STACK
jsr $26, syscall_trace_enter /* returns the syscall number */
UNDO_SWITCH_STACK
@@ -583,7 +594,7 @@ strace:
ldq $21, 88($sp)
/* get the system call pointer.. */
- lda $1, NR_SYSCALLS($31)
+ lda $1, NR_syscalls($31)
lda $2, sys_call_table
lda $27, sys_ni_syscall
cmpult $0, $1, $1
@@ -649,40 +660,6 @@ do_switch_stack:
stq $14, 40($sp)
stq $15, 48($sp)
stq $26, 56($sp)
- stt $f0, 64($sp)
- stt $f1, 72($sp)
- stt $f2, 80($sp)
- stt $f3, 88($sp)
- stt $f4, 96($sp)
- stt $f5, 104($sp)
- stt $f6, 112($sp)
- stt $f7, 120($sp)
- stt $f8, 128($sp)
- stt $f9, 136($sp)
- stt $f10, 144($sp)
- stt $f11, 152($sp)
- stt $f12, 160($sp)
- stt $f13, 168($sp)
- stt $f14, 176($sp)
- stt $f15, 184($sp)
- stt $f16, 192($sp)
- stt $f17, 200($sp)
- stt $f18, 208($sp)
- stt $f19, 216($sp)
- stt $f20, 224($sp)
- stt $f21, 232($sp)
- stt $f22, 240($sp)
- stt $f23, 248($sp)
- stt $f24, 256($sp)
- stt $f25, 264($sp)
- stt $f26, 272($sp)
- stt $f27, 280($sp)
- mf_fpcr $f0 # get fpcr
- stt $f28, 288($sp)
- stt $f29, 296($sp)
- stt $f30, 304($sp)
- stt $f0, 312($sp) # save fpcr in slot of $f31
- ldt $f0, 64($sp) # dont let "do_switch_stack" change fp state.
ret $31, ($1), 1
.cfi_endproc
.size do_switch_stack, .-do_switch_stack
@@ -701,54 +678,71 @@ undo_switch_stack:
ldq $14, 40($sp)
ldq $15, 48($sp)
ldq $26, 56($sp)
- ldt $f30, 312($sp) # get saved fpcr
- ldt $f0, 64($sp)
- ldt $f1, 72($sp)
- ldt $f2, 80($sp)
- ldt $f3, 88($sp)
- mt_fpcr $f30 # install saved fpcr
- ldt $f4, 96($sp)
- ldt $f5, 104($sp)
- ldt $f6, 112($sp)
- ldt $f7, 120($sp)
- ldt $f8, 128($sp)
- ldt $f9, 136($sp)
- ldt $f10, 144($sp)
- ldt $f11, 152($sp)
- ldt $f12, 160($sp)
- ldt $f13, 168($sp)
- ldt $f14, 176($sp)
- ldt $f15, 184($sp)
- ldt $f16, 192($sp)
- ldt $f17, 200($sp)
- ldt $f18, 208($sp)
- ldt $f19, 216($sp)
- ldt $f20, 224($sp)
- ldt $f21, 232($sp)
- ldt $f22, 240($sp)
- ldt $f23, 248($sp)
- ldt $f24, 256($sp)
- ldt $f25, 264($sp)
- ldt $f26, 272($sp)
- ldt $f27, 280($sp)
- ldt $f28, 288($sp)
- ldt $f29, 296($sp)
- ldt $f30, 304($sp)
lda $sp, SWITCH_STACK_SIZE($sp)
ret $31, ($1), 1
.cfi_endproc
.size undo_switch_stack, .-undo_switch_stack
+
+#define FR(n) n * 8 + TI_FP($8)
+ .align 4
+ .globl __save_fpu
+ .type __save_fpu, @function
+__save_fpu:
+#define V(n) stt $f##n, FR(n)
+ V( 0); V( 1); V( 2); V( 3)
+ V( 4); V( 5); V( 6); V( 7)
+ V( 8); V( 9); V(10); V(11)
+ V(12); V(13); V(14); V(15)
+ V(16); V(17); V(18); V(19)
+ V(20); V(21); V(22); V(23)
+ V(24); V(25); V(26); V(27)
+ mf_fpcr $f0 # get fpcr
+ V(28); V(29); V(30)
+ stt $f0, FR(31) # save fpcr in slot of $f31
+ ldt $f0, FR(0) # don't let "__save_fpu" change fp state.
+ ret
+#undef V
+ .size __save_fpu, .-__save_fpu
+
+ .align 4
+restore_fpu:
+ and $3, TS_RESTORE_FP, $3
+ bic $2, TS_SAVED_FP | TS_RESTORE_FP, $2
+ beq $3, 1f
+#define V(n) ldt $f##n, FR(n)
+ ldt $f30, FR(31) # get saved fpcr
+ V( 0); V( 1); V( 2); V( 3)
+ mt_fpcr $f30 # install saved fpcr
+ V( 4); V( 5); V( 6); V( 7)
+ V( 8); V( 9); V(10); V(11)
+ V(12); V(13); V(14); V(15)
+ V(16); V(17); V(18); V(19)
+ V(20); V(21); V(22); V(23)
+ V(24); V(25); V(26); V(27)
+ V(28); V(29); V(30)
+1: stl $2, TI_STATUS($8)
+ br restore_other
+#undef V
+
/*
* The meat of the context switch code.
*/
-
.align 4
.globl alpha_switch_to
.type alpha_switch_to, @function
.cfi_startproc
alpha_switch_to:
DO_SWITCH_STACK
+ ldl $1, TI_STATUS($8)
+ and $1, TS_RESTORE_FP, $3
+ bne $3, 1f
+ or $1, TS_RESTORE_FP | TS_SAVED_FP, $2
+ and $1, TS_SAVED_FP, $3
+ stl $2, TI_STATUS($8)
+ bne $3, 1f
+ bsr $26, __save_fpu
+1:
call_pal PAL_swpctx
lda $8, 0x3fff
UNDO_SWITCH_STACK
@@ -799,6 +793,14 @@ ret_from_kernel_thread:
alpha_\name:
.prologue 0
bsr $1, do_switch_stack
+ // NB: if anyone adds preemption, this block will need to be protected
+ ldl $1, TI_STATUS($8)
+ and $1, TS_SAVED_FP, $3
+ or $1, TS_SAVED_FP, $2
+ bne $3, 1f
+ stl $2, TI_STATUS($8)
+ bsr $26, __save_fpu
+1:
jsr $26, sys_\name
ldq $26, 56($sp)
lda $sp, SWITCH_STACK_SIZE($sp)
diff --git a/arch/alpha/kernel/module.c b/arch/alpha/kernel/module.c
index 5b60c248de9e..cbefa5a77384 100644
--- a/arch/alpha/kernel/module.c
+++ b/arch/alpha/kernel/module.c
@@ -146,10 +146,8 @@ apply_relocate_add(Elf64_Shdr *sechdrs, const char *strtab,
base = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr;
symtab = (Elf64_Sym *)sechdrs[symindex].sh_addr;
- /* The small sections were sorted to the end of the segment.
- The following should definitely cover them. */
- gp = (u64)me->core_layout.base + me->core_layout.size - 0x8000;
got = sechdrs[me->arch.gotsecindex].sh_addr;
+ gp = got + 0x8000;
for (i = 0; i < n; i++) {
unsigned long r_sym = ELF64_R_SYM (rela[i].r_info);
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index c54469b369cb..2a9a877a0508 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -522,7 +522,7 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, const char __user *, path,
break;
default:
retval = -EINVAL;
- printk("osf_mount(%ld, %x)\n", typenr, flag);
+ printk_ratelimited("osf_mount(%ld, %x)\n", typenr, flag);
}
return retval;
diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index e83a02ed5267..c81183935e97 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -127,10 +127,12 @@ again:
goto again;
}
- if (ptes[p+i])
- p = ALIGN(p + i + 1, mask + 1), i = 0;
- else
+ if (ptes[p+i]) {
+ p = ALIGN(p + i + 1, mask + 1);
+ i = 0;
+ } else {
i = i + 1;
+ }
}
if (i < n) {
diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c
index efcf7321701b..ccdb508c1516 100644
--- a/arch/alpha/kernel/perf_event.c
+++ b/arch/alpha/kernel/perf_event.c
@@ -689,8 +689,6 @@ static int __hw_perf_event_init(struct perf_event *event)
*/
static int alpha_pmu_event_init(struct perf_event *event)
{
- int err;
-
/* does not support taken branch sampling */
if (has_branch_stack(event))
return -EOPNOTSUPP;
@@ -709,9 +707,7 @@ static int alpha_pmu_event_init(struct perf_event *event)
return -ENODEV;
/* Do the real initialisation work. */
- err = __hw_perf_event_init(event);
-
- return err;
+ return __hw_perf_event_init(event);
}
/*
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 0eddd22c6212..e9cf7193eb81 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -133,7 +133,7 @@ common_shutdown_1(void *generic_ptr)
#ifdef CONFIG_DUMMY_CONSOLE
/* If we've gotten here after SysRq-b, leave interrupt
context before taking over the console. */
- if (in_irq())
+ if (in_hardirq())
irq_exit();
/* This has the effect of resetting the VGA video origin. */
console_lock();
@@ -243,6 +243,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
childstack = ((struct switch_stack *) childregs) - 1;
childti->pcb.ksp = (unsigned long) childstack;
childti->pcb.flags = 1; /* set FEN, clear everything else */
+ childti->status |= TS_SAVED_FP | TS_RESTORE_FP;
if (unlikely(args->fn)) {
/* kernel thread */
@@ -252,6 +253,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
childstack->r9 = (unsigned long) args->fn;
childstack->r10 = (unsigned long) args->fn_arg;
childregs->hae = alpha_mv.hae_cache;
+ memset(childti->fp, '\0', sizeof(childti->fp));
childti->pcb.usp = 0;
return 0;
}
@@ -334,8 +336,7 @@ EXPORT_SYMBOL(dump_elf_task);
int elf_core_copy_task_fpregs(struct task_struct *t, elf_fpregset_t *fpu)
{
- struct switch_stack *sw = (struct switch_stack *)task_pt_regs(t) - 1;
- memcpy(fpu, sw->fp, 32 * 8);
+ memcpy(fpu, task_thread_info(t)->fp, 32 * 8);
return 1;
}
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index a1a239ea002d..fde4c68e7a0b 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -78,6 +78,8 @@ enum {
(PAGE_SIZE*2 - sizeof(struct pt_regs) - sizeof(struct switch_stack) \
+ offsetof(struct switch_stack, reg))
+#define FP_REG(reg) (offsetof(struct thread_info, reg))
+
static int regoff[] = {
PT_REG( r0), PT_REG( r1), PT_REG( r2), PT_REG( r3),
PT_REG( r4), PT_REG( r5), PT_REG( r6), PT_REG( r7),
@@ -87,14 +89,14 @@ static int regoff[] = {
PT_REG( r20), PT_REG( r21), PT_REG( r22), PT_REG( r23),
PT_REG( r24), PT_REG( r25), PT_REG( r26), PT_REG( r27),
PT_REG( r28), PT_REG( gp), -1, -1,
- SW_REG(fp[ 0]), SW_REG(fp[ 1]), SW_REG(fp[ 2]), SW_REG(fp[ 3]),
- SW_REG(fp[ 4]), SW_REG(fp[ 5]), SW_REG(fp[ 6]), SW_REG(fp[ 7]),
- SW_REG(fp[ 8]), SW_REG(fp[ 9]), SW_REG(fp[10]), SW_REG(fp[11]),
- SW_REG(fp[12]), SW_REG(fp[13]), SW_REG(fp[14]), SW_REG(fp[15]),
- SW_REG(fp[16]), SW_REG(fp[17]), SW_REG(fp[18]), SW_REG(fp[19]),
- SW_REG(fp[20]), SW_REG(fp[21]), SW_REG(fp[22]), SW_REG(fp[23]),
- SW_REG(fp[24]), SW_REG(fp[25]), SW_REG(fp[26]), SW_REG(fp[27]),
- SW_REG(fp[28]), SW_REG(fp[29]), SW_REG(fp[30]), SW_REG(fp[31]),
+ FP_REG(fp[ 0]), FP_REG(fp[ 1]), FP_REG(fp[ 2]), FP_REG(fp[ 3]),
+ FP_REG(fp[ 4]), FP_REG(fp[ 5]), FP_REG(fp[ 6]), FP_REG(fp[ 7]),
+ FP_REG(fp[ 8]), FP_REG(fp[ 9]), FP_REG(fp[10]), FP_REG(fp[11]),
+ FP_REG(fp[12]), FP_REG(fp[13]), FP_REG(fp[14]), FP_REG(fp[15]),
+ FP_REG(fp[16]), FP_REG(fp[17]), FP_REG(fp[18]), FP_REG(fp[19]),
+ FP_REG(fp[20]), FP_REG(fp[21]), FP_REG(fp[22]), FP_REG(fp[23]),
+ FP_REG(fp[24]), FP_REG(fp[25]), FP_REG(fp[26]), FP_REG(fp[27]),
+ FP_REG(fp[28]), FP_REG(fp[29]), FP_REG(fp[30]), FP_REG(fp[31]),
PT_REG( pc)
};
diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c
index 6f47f256fe80..e62d1d461b1f 100644
--- a/arch/alpha/kernel/signal.c
+++ b/arch/alpha/kernel/signal.c
@@ -150,9 +150,10 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
{
unsigned long usp;
struct switch_stack *sw = (struct switch_stack *)regs - 1;
- long i, err = __get_user(regs->pc, &sc->sc_pc);
+ long err = __get_user(regs->pc, &sc->sc_pc);
current->restart_block.fn = do_no_restart_syscall;
+ current_thread_info()->status |= TS_SAVED_FP | TS_RESTORE_FP;
sw->r26 = (unsigned long) ret_from_sys_call;
@@ -189,9 +190,9 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs)
err |= __get_user(usp, sc->sc_regs+30);
wrusp(usp);
- for (i = 0; i < 31; i++)
- err |= __get_user(sw->fp[i], sc->sc_fpregs+i);
- err |= __get_user(sw->fp[31], &sc->sc_fpcr);
+ err |= __copy_from_user(current_thread_info()->fp,
+ sc->sc_fpregs, 31 * 8);
+ err |= __get_user(current_thread_info()->fp[31], &sc->sc_fpcr);
return err;
}
@@ -272,7 +273,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
unsigned long mask, unsigned long sp)
{
struct switch_stack *sw = (struct switch_stack *)regs - 1;
- long i, err = 0;
+ long err = 0;
err |= __put_user(on_sig_stack((unsigned long)sc), &sc->sc_onstack);
err |= __put_user(mask, &sc->sc_mask);
@@ -312,10 +313,10 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
err |= __put_user(sp, sc->sc_regs+30);
err |= __put_user(0, sc->sc_regs+31);
- for (i = 0; i < 31; i++)
- err |= __put_user(sw->fp[i], sc->sc_fpregs+i);
+ err |= __copy_to_user(sc->sc_fpregs,
+ current_thread_info()->fp, 31 * 8);
err |= __put_user(0, sc->sc_fpregs+31);
- err |= __put_user(sw->fp[31], &sc->sc_fpcr);
+ err |= __put_user(current_thread_info()->fp[31], &sc->sc_fpcr);
err |= __put_user(regs->trap_a0, &sc->sc_traparg_a0);
err |= __put_user(regs->trap_a1, &sc->sc_traparg_a1);
@@ -528,6 +529,9 @@ do_work_pending(struct pt_regs *regs, unsigned long thread_flags,
} else {
local_irq_enable();
if (thread_flags & (_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL)) {
+ preempt_disable();
+ save_fpu();
+ preempt_enable();
do_signal(regs, r0, r19);
r0 = 0;
} else {
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 8a66fe544c69..d9a67b370e04 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -233,7 +233,21 @@ do_entIF(unsigned long type, struct pt_regs *regs)
{
int signo, code;
- if ((regs->ps & ~IPL_MAX) == 0) {
+ if (type == 3) { /* FEN fault */
+ /* Irritating users can call PAL_clrfen to disable the
+ FPU for the process. The kernel will then trap in
+ do_switch_stack and undo_switch_stack when we try
+ to save and restore the FP registers.
+
+ Given that GCC by default generates code that uses the
+ FP registers, PAL_clrfen is not useful except for DoS
+ attacks. So turn the bleeding FPU back on and be done
+ with it. */
+ current_thread_info()->pcb.flags |= 1;
+ __reload_thread(&current_thread_info()->pcb);
+ return;
+ }
+ if (!user_mode(regs)) {
if (type == 1) {
const unsigned int *data
= (const unsigned int *) regs->pc;
@@ -366,20 +380,6 @@ do_entIF(unsigned long type, struct pt_regs *regs)
}
break;
- case 3: /* FEN fault */
- /* Irritating users can call PAL_clrfen to disable the
- FPU for the process. The kernel will then trap in
- do_switch_stack and undo_switch_stack when we try
- to save and restore the FP registers.
-
- Given that GCC by default generates code that uses the
- FP registers, PAL_clrfen is not useful except for DoS
- attacks. So turn the bleeding FPU back on and be done
- with it. */
- current_thread_info()->pcb.flags |= 1;
- __reload_thread(&current_thread_info()->pcb);
- return;
-
case 5: /* illoc */
default: /* unexpected instruction-fault type */
;
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c
index 34fea465645b..612c5eca71bc 100644
--- a/arch/alpha/lib/fpreg.c
+++ b/arch/alpha/lib/fpreg.c
@@ -7,6 +7,8 @@
#include <linux/compiler.h>
#include <linux/export.h>
+#include <linux/preempt.h>
+#include <asm/thread_info.h>
#if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
#define STT(reg,val) asm volatile ("ftoit $f"#reg",%0" : "=r"(val));
@@ -19,7 +21,12 @@ alpha_read_fp_reg (unsigned long reg)
{
unsigned long val;
- switch (reg) {
+ if (unlikely(reg >= 32))
+ return 0;
+ preempt_enable();
+ if (current_thread_info()->status & TS_SAVED_FP)
+ val = current_thread_info()->fp[reg];
+ else switch (reg) {
case 0: STT( 0, val); break;
case 1: STT( 1, val); break;
case 2: STT( 2, val); break;
@@ -52,8 +59,8 @@ alpha_read_fp_reg (unsigned long reg)
case 29: STT(29, val); break;
case 30: STT(30, val); break;
case 31: STT(31, val); break;
- default: return 0;
}
+ preempt_enable();
return val;
}
EXPORT_SYMBOL(alpha_read_fp_reg);
@@ -67,7 +74,14 @@ EXPORT_SYMBOL(alpha_read_fp_reg);
void
alpha_write_fp_reg (unsigned long reg, unsigned long val)
{
- switch (reg) {
+ if (unlikely(reg >= 32))
+ return;
+
+ preempt_disable();
+ if (current_thread_info()->status & TS_SAVED_FP) {
+ current_thread_info()->status |= TS_RESTORE_FP;
+ current_thread_info()->fp[reg] = val;
+ } else switch (reg) {
case 0: LDT( 0, val); break;
case 1: LDT( 1, val); break;
case 2: LDT( 2, val); break;
@@ -101,6 +115,7 @@ alpha_write_fp_reg (unsigned long reg, unsigned long val)
case 30: LDT(30, val); break;
case 31: LDT(31, val); break;
}
+ preempt_enable();
}
EXPORT_SYMBOL(alpha_write_fp_reg);
@@ -115,7 +130,14 @@ alpha_read_fp_reg_s (unsigned long reg)
{
unsigned long val;
- switch (reg) {
+ if (unlikely(reg >= 32))
+ return 0;
+
+ preempt_enable();
+ if (current_thread_info()->status & TS_SAVED_FP) {
+ LDT(0, current_thread_info()->fp[reg]);
+ STS(0, val);
+ } else switch (reg) {
case 0: STS( 0, val); break;
case 1: STS( 1, val); break;
case 2: STS( 2, val); break;
@@ -148,8 +170,8 @@ alpha_read_fp_reg_s (unsigned long reg)
case 29: STS(29, val); break;
case 30: STS(30, val); break;
case 31: STS(31, val); break;
- default: return 0;
}
+ preempt_enable();
return val;
}
EXPORT_SYMBOL(alpha_read_fp_reg_s);
@@ -163,7 +185,15 @@ EXPORT_SYMBOL(alpha_read_fp_reg_s);
void
alpha_write_fp_reg_s (unsigned long reg, unsigned long val)
{
- switch (reg) {
+ if (unlikely(reg >= 32))
+ return;
+
+ preempt_disable();
+ if (current_thread_info()->status & TS_SAVED_FP) {
+ current_thread_info()->status |= TS_RESTORE_FP;
+ LDS(0, val);
+ STT(0, current_thread_info()->fp[reg]);
+ } else switch (reg) {
case 0: LDS( 0, val); break;
case 1: LDS( 1, val); break;
case 2: LDS( 2, val); break;
@@ -197,5 +227,6 @@ alpha_write_fp_reg_s (unsigned long reg, unsigned long val)
case 30: LDS(30, val); break;
case 31: LDS(31, val); break;
}
+ preempt_enable();
}
EXPORT_SYMBOL(alpha_write_fp_reg_s);
diff --git a/arch/alpha/lib/stacktrace.c b/arch/alpha/lib/stacktrace.c
index 62454a7810e2..2b1176dd5174 100644
--- a/arch/alpha/lib/stacktrace.c
+++ b/arch/alpha/lib/stacktrace.c
@@ -92,7 +92,7 @@ stacktrace(void)
{
instr * ret_pc;
instr * prologue = (instr *)stacktrace;
- register unsigned char * sp __asm__ ("$30");
+ unsigned char *sp = (unsigned char *)current_stack_pointer;
printk("\tstack trace:\n");
do {
diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts
index 0f9a4f0a5571..a5be0ee048ec 100644
--- a/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts
+++ b/arch/arm/boot/dts/aspeed-bmc-ibm-bonnell.dts
@@ -124,7 +124,7 @@
};
};
- iio-hwmon-battery {
+ iio-hwmon {
compatible = "iio-hwmon";
io-channels = <&adc1 7>;
};
diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts
index 456ca2830a31..c3b0cd61ac85 100644
--- a/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts
+++ b/arch/arm/boot/dts/aspeed-bmc-ibm-everest.dts
@@ -244,7 +244,7 @@
};
};
- iio-hwmon-battery {
+ iio-hwmon {
compatible = "iio-hwmon";
io-channels = <&adc1 7>;
};
diff --git a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts
index e1b5d44308fe..7162e65b8115 100644
--- a/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts
+++ b/arch/arm/boot/dts/aspeed-bmc-ibm-rainier.dts
@@ -220,7 +220,7 @@
};
};
- iio-hwmon-battery {
+ iio-hwmon {
compatible = "iio-hwmon";
io-channels = <&adc1 7>;
};
diff --git a/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi b/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi
index 021d9fc1b492..27a1a8952665 100644
--- a/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi
+++ b/arch/arm/boot/dts/exynos4-cpu-thermal.dtsi
@@ -10,7 +10,7 @@
/ {
thermal-zones {
cpu_thermal: cpu-thermal {
- thermal-sensors = <&tmu 0>;
+ thermal-sensors = <&tmu>;
polling-delay-passive = <0>;
polling-delay = <0>;
trips {
diff --git a/arch/arm/boot/dts/exynos4210.dtsi b/arch/arm/boot/dts/exynos4210.dtsi
index 5a1ec714c612..0e27c3375e2e 100644
--- a/arch/arm/boot/dts/exynos4210.dtsi
+++ b/arch/arm/boot/dts/exynos4210.dtsi
@@ -393,7 +393,6 @@
&cpu_thermal {
polling-delay-passive = <0>;
polling-delay = <0>;
- thermal-sensors = <&tmu 0>;
};
&gic {
diff --git a/arch/arm/boot/dts/exynos5250.dtsi b/arch/arm/boot/dts/exynos5250.dtsi
index a5db4ac213d5..60a623e3a200 100644
--- a/arch/arm/boot/dts/exynos5250.dtsi
+++ b/arch/arm/boot/dts/exynos5250.dtsi
@@ -1107,7 +1107,7 @@
&cpu_thermal {
polling-delay-passive = <0>;
polling-delay = <0>;
- thermal-sensors = <&tmu 0>;
+ thermal-sensors = <&tmu>;
cooling-maps {
map0 {
diff --git a/arch/arm/boot/dts/exynos5410-odroidxu.dts b/arch/arm/boot/dts/exynos5410-odroidxu.dts
index 232561620da2..6ddd1dd2fb0b 100644
--- a/arch/arm/boot/dts/exynos5410-odroidxu.dts
+++ b/arch/arm/boot/dts/exynos5410-odroidxu.dts
@@ -120,7 +120,6 @@
};
&cpu0_thermal {
- thermal-sensors = <&tmu_cpu0 0>;
polling-delay-passive = <0>;
polling-delay = <0>;
diff --git a/arch/arm/boot/dts/exynos5422-odroidhc1.dts b/arch/arm/boot/dts/exynos5422-odroidhc1.dts
index 3de7019572a2..5e4280393706 100644
--- a/arch/arm/boot/dts/exynos5422-odroidhc1.dts
+++ b/arch/arm/boot/dts/exynos5422-odroidhc1.dts
@@ -31,7 +31,7 @@
thermal-zones {
cpu0_thermal: cpu0-thermal {
- thermal-sensors = <&tmu_cpu0 0>;
+ thermal-sensors = <&tmu_cpu0>;
trips {
cpu0_alert0: cpu-alert-0 {
temperature = <70000>; /* millicelsius */
@@ -86,7 +86,7 @@
};
};
cpu1_thermal: cpu1-thermal {
- thermal-sensors = <&tmu_cpu1 0>;
+ thermal-sensors = <&tmu_cpu1>;
trips {
cpu1_alert0: cpu-alert-0 {
temperature = <70000>;
@@ -130,7 +130,7 @@
};
};
cpu2_thermal: cpu2-thermal {
- thermal-sensors = <&tmu_cpu2 0>;
+ thermal-sensors = <&tmu_cpu2>;
trips {
cpu2_alert0: cpu-alert-0 {
temperature = <70000>;
@@ -174,7 +174,7 @@
};
};
cpu3_thermal: cpu3-thermal {
- thermal-sensors = <&tmu_cpu3 0>;
+ thermal-sensors = <&tmu_cpu3>;
trips {
cpu3_alert0: cpu-alert-0 {
temperature = <70000>;
@@ -218,7 +218,7 @@
};
};
gpu_thermal: gpu-thermal {
- thermal-sensors = <&tmu_gpu 0>;
+ thermal-sensors = <&tmu_gpu>;
trips {
gpu_alert0: gpu-alert-0 {
temperature = <70000>;
diff --git a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
index a6961ff24030..e6e7e2ff2a26 100644
--- a/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
+++ b/arch/arm/boot/dts/exynos5422-odroidxu3-common.dtsi
@@ -50,7 +50,7 @@
thermal-zones {
cpu0_thermal: cpu0-thermal {
- thermal-sensors = <&tmu_cpu0 0>;
+ thermal-sensors = <&tmu_cpu0>;
polling-delay-passive = <250>;
polling-delay = <0>;
trips {
@@ -139,7 +139,7 @@
};
};
cpu1_thermal: cpu1-thermal {
- thermal-sensors = <&tmu_cpu1 0>;
+ thermal-sensors = <&tmu_cpu1>;
polling-delay-passive = <250>;
polling-delay = <0>;
trips {
@@ -212,7 +212,7 @@
};
};
cpu2_thermal: cpu2-thermal {
- thermal-sensors = <&tmu_cpu2 0>;
+ thermal-sensors = <&tmu_cpu2>;
polling-delay-passive = <250>;
polling-delay = <0>;
trips {
@@ -285,7 +285,7 @@
};
};
cpu3_thermal: cpu3-thermal {
- thermal-sensors = <&tmu_cpu3 0>;
+ thermal-sensors = <&tmu_cpu3>;
polling-delay-passive = <250>;
polling-delay = <0>;
trips {
@@ -358,7 +358,7 @@
};
};
gpu_thermal: gpu-thermal {
- thermal-sensors = <&tmu_gpu 0>;
+ thermal-sensors = <&tmu_gpu>;
polling-delay-passive = <250>;
polling-delay = <0>;
trips {
diff --git a/arch/arm/boot/dts/spear320-hmi.dts b/arch/arm/boot/dts/spear320-hmi.dts
index 34503ac9c51c..721e5ee7b680 100644
--- a/arch/arm/boot/dts/spear320-hmi.dts
+++ b/arch/arm/boot/dts/spear320-hmi.dts
@@ -241,7 +241,7 @@
irq-trigger = <0x1>;
stmpegpio: stmpe-gpio {
- compatible = "stmpe,gpio";
+ compatible = "st,stmpe-gpio";
reg = <0>;
gpio-controller;
#gpio-cells = <2>;
diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c
index 5d2f386a46d8..eca2fe0f4314 100644
--- a/arch/arm/mach-qcom/platsmp.c
+++ b/arch/arm/mach-qcom/platsmp.c
@@ -14,7 +14,7 @@
#include <linux/of_address.h>
#include <linux/smp.h>
#include <linux/io.h>
-#include <linux/qcom_scm.h>
+#include <linux/firmware/qcom/qcom_scm.h>
#include <asm/smp_plat.h>
diff --git a/arch/arm/mach-s3c/Makefile b/arch/arm/mach-s3c/Makefile
index 988c49672715..713827bef831 100644
--- a/arch/arm/mach-s3c/Makefile
+++ b/arch/arm/mach-s3c/Makefile
@@ -2,7 +2,7 @@
#
# Copyright 2009 Simtec Electronics
-include $(src)/Makefile.s3c64xx
+include $(srctree)/$(src)/Makefile.s3c64xx
# Objects we always build independent of SoC choice
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c135f6e37a00..8bc01071474a 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -984,7 +984,8 @@ __iommu_create_mapping(struct device *dev, struct page **pages, size_t size,
len = (j - i) << PAGE_SHIFT;
ret = iommu_map(mapping->domain, iova, phys, len,
- __dma_info_to_prot(DMA_BIDIRECTIONAL, attrs));
+ __dma_info_to_prot(DMA_BIDIRECTIONAL, attrs),
+ GFP_KERNEL);
if (ret < 0)
goto fail;
iova += len;
@@ -1207,7 +1208,8 @@ static int __map_sg_chunk(struct device *dev, struct scatterlist *sg,
prot = __dma_info_to_prot(dir, attrs);
- ret = iommu_map(mapping->domain, iova, phys, len, prot);
+ ret = iommu_map(mapping->domain, iova, phys, len, prot,
+ GFP_KERNEL);
if (ret < 0)
goto fail;
count += len >> PAGE_SHIFT;
@@ -1379,7 +1381,8 @@ static dma_addr_t arm_iommu_map_page(struct device *dev, struct page *page,
prot = __dma_info_to_prot(dir, attrs);
- ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len, prot);
+ ret = iommu_map(mapping->domain, dma_addr, page_to_phys(page), len,
+ prot, GFP_KERNEL);
if (ret < 0)
goto fail;
@@ -1443,7 +1446,7 @@ static dma_addr_t arm_iommu_map_resource(struct device *dev,
prot = __dma_info_to_prot(dir, attrs) | IOMMU_MMIO;
- ret = iommu_map(mapping->domain, dma_addr, addr, len, prot);
+ ret = iommu_map(mapping->domain, dma_addr, addr, len, prot, GFP_KERNEL);
if (ret < 0)
goto fail;
diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi b/arch/arm64/boot/dts/qcom/sm8250.dtsi
index f69fa68a1222..2f0e460acccd 100644
--- a/arch/arm64/boot/dts/qcom/sm8250.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi
@@ -2293,7 +2293,7 @@
pinctrl-names = "default";
pinctrl-0 = <&rx_swr_active>;
compatible = "qcom,sm8250-lpass-rx-macro";
- reg = <0 0x3200000 0 0x1000>;
+ reg = <0 0x03200000 0 0x1000>;
status = "disabled";
clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
@@ -2310,7 +2310,7 @@
};
swr1: soundwire-controller@3210000 {
- reg = <0 0x3210000 0 0x2000>;
+ reg = <0 0x03210000 0 0x2000>;
compatible = "qcom,soundwire-v1.5.1";
status = "disabled";
interrupts = <GIC_SPI 298 IRQ_TYPE_LEVEL_HIGH>;
@@ -2339,7 +2339,7 @@
pinctrl-names = "default";
pinctrl-0 = <&tx_swr_active>;
compatible = "qcom,sm8250-lpass-tx-macro";
- reg = <0 0x3220000 0 0x1000>;
+ reg = <0 0x03220000 0 0x1000>;
status = "disabled";
clocks = <&q6afecc LPASS_CLK_ID_TX_CORE_MCLK LPASS_CLK_ATTRIBUTE_COUPLE_NO>,
@@ -2357,7 +2357,7 @@
/* tx macro */
swr2: soundwire-controller@3230000 {
- reg = <0 0x3230000 0 0x2000>;
+ reg = <0 0x03230000 0 0x2000>;
compatible = "qcom,soundwire-v1.5.1";
interrupts-extended = <&intc GIC_SPI 297 IRQ_TYPE_LEVEL_HIGH>;
interrupt-names = "core";
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 7e0487bbdaa0..7790ee42c68a 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -115,6 +115,7 @@ CONFIG_KVM=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
+CONFIG_IOSCHED_BFQ=y
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_MEMORY_HOTPLUG=y
@@ -219,7 +220,7 @@ CONFIG_PCI_HOST_THUNDER_ECAM=y
CONFIG_PCIE_ROCKCHIP_HOST=m
CONFIG_PCIE_MEDIATEK_GEN3=m
CONFIG_PCIE_BRCMSTB=m
-CONFIG_PCI_IMX6=y
+CONFIG_PCI_IMX6_HOST=y
CONFIG_PCI_LAYERSCAPE=y
CONFIG_PCI_HISI=y
CONFIG_PCIE_QCOM=y
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index c0b178d1bb4f..a51e6e8f3171 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -16,6 +16,15 @@
#define CLIDR_LOC(clidr) (((clidr) >> CLIDR_LOC_SHIFT) & 0x7)
#define CLIDR_LOUIS(clidr) (((clidr) >> CLIDR_LOUIS_SHIFT) & 0x7)
+/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */
+#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1))
+#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level))
+#define CLIDR_CTYPE(clidr, level) \
+ (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))
+
+/* Ttypen, bits [2(n - 1) + 34 : 2(n - 1) + 33], for n = 1 to 7 */
+#define CLIDR_TTYPE_SHIFT(level) (2 * ((level) - 1) + CLIDR_EL1_Ttypen_SHIFT)
+
/*
* Memory returned by kmalloc() may be used for DMA, so we must make
* sure that all such allocations are cache aligned. Otherwise,
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 2cdd010f9524..037724b19c5c 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -196,4 +196,103 @@
__init_el2_nvhe_prepare_eret
.endm
+#ifndef __KVM_NVHE_HYPERVISOR__
+// This will clobber tmp1 and tmp2, and expect tmp1 to contain
+// the id register value as read from the HW
+.macro __check_override idreg, fld, width, pass, fail, tmp1, tmp2
+ ubfx \tmp1, \tmp1, #\fld, #\width
+ cbz \tmp1, \fail
+
+ adr_l \tmp1, \idreg\()_override
+ ldr \tmp2, [\tmp1, FTR_OVR_VAL_OFFSET]
+ ldr \tmp1, [\tmp1, FTR_OVR_MASK_OFFSET]
+ ubfx \tmp2, \tmp2, #\fld, #\width
+ ubfx \tmp1, \tmp1, #\fld, #\width
+ cmp \tmp1, xzr
+ and \tmp2, \tmp2, \tmp1
+ csinv \tmp2, \tmp2, xzr, ne
+ cbnz \tmp2, \pass
+ b \fail
+.endm
+
+// This will clobber tmp1 and tmp2
+.macro check_override idreg, fld, pass, fail, tmp1, tmp2
+ mrs \tmp1, \idreg\()_el1
+ __check_override \idreg \fld 4 \pass \fail \tmp1 \tmp2
+.endm
+#else
+// This will clobber tmp
+.macro __check_override idreg, fld, width, pass, fail, tmp, ignore
+ ldr_l \tmp, \idreg\()_el1_sys_val
+ ubfx \tmp, \tmp, #\fld, #\width
+ cbnz \tmp, \pass
+ b \fail
+.endm
+
+.macro check_override idreg, fld, pass, fail, tmp, ignore
+ __check_override \idreg \fld 4 \pass \fail \tmp \ignore
+.endm
+#endif
+
+.macro finalise_el2_state
+ check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
+
+.Linit_sve_\@: /* SVE register access */
+ mrs x0, cptr_el2 // Disable SVE traps
+ bic x0, x0, #CPTR_EL2_TZ
+ msr cptr_el2, x0
+ isb
+ mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
+ msr_s SYS_ZCR_EL2, x1 // length for EL1.
+
+.Lskip_sve_\@:
+ check_override id_aa64pfr1, ID_AA64PFR1_EL1_SME_SHIFT, .Linit_sme_\@, .Lskip_sme_\@, x1, x2
+
+.Linit_sme_\@: /* SME register access and priority mapping */
+ mrs x0, cptr_el2 // Disable SME traps
+ bic x0, x0, #CPTR_EL2_TSM
+ msr cptr_el2, x0
+ isb
+
+ mrs x1, sctlr_el2
+ orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps
+ msr sctlr_el2, x1
+ isb
+
+ mov x0, #0 // SMCR controls
+
+ // Full FP in SM?
+ mrs_s x1, SYS_ID_AA64SMFR0_EL1
+ __check_override id_aa64smfr0, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, .Linit_sme_fa64_\@, .Lskip_sme_fa64_\@, x1, x2
+
+.Linit_sme_fa64_\@:
+ orr x0, x0, SMCR_ELx_FA64_MASK
+.Lskip_sme_fa64_\@:
+
+ // ZT0 available?
+ mrs_s x1, SYS_ID_AA64SMFR0_EL1
+ __check_override id_aa64smfr0, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, .Linit_sme_zt0_\@, .Lskip_sme_zt0_\@, x1, x2
+.Linit_sme_zt0_\@:
+ orr x0, x0, SMCR_ELx_EZT0_MASK
+.Lskip_sme_zt0_\@:
+
+ orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector
+ msr_s SYS_SMCR_EL2, x0 // length for EL1.
+
+ mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported?
+ ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1
+ cbz x1, .Lskip_sme_\@
+
+ msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal
+
+ mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present?
+ ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
+ cbz x1, .Lskip_sme_\@
+
+ mrs_s x1, SYS_HCRX_EL2
+ orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping
+ msr_s SYS_HCRX_EL2, x1
+.Lskip_sme_\@:
+.endm
+
#endif /* __ARM_KVM_INIT_H__ */
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h
index c9f15b9e3c71..8487aec9b658 100644
--- a/arch/arm64/include/asm/esr.h
+++ b/arch/arm64/include/asm/esr.h
@@ -272,6 +272,10 @@
(((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \
ESR_ELx_SYS64_ISS_OP2_SHIFT))
+/* ISS field definitions for ERET/ERETAA/ERETAB trapping */
+#define ESR_ELx_ERET_ISS_ERET 0x2
+#define ESR_ELx_ERET_ISS_ERETA 0x1
+
/*
* ISS field definitions for floating-point exception traps
* (FP_EXC_32/FP_EXC_64).
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 26b0c97df986..baef29fcbeee 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -81,11 +81,12 @@
* SWIO: Turn set/way invalidates into set/way clean+invalidate
* PTW: Take a stage2 fault if a stage1 walk steps in device memory
* TID3: Trap EL1 reads of group 3 ID registers
+ * TID2: Trap CTR_EL0, CCSIDR2_EL1, CLIDR_EL1, and CSSELR_EL1
*/
#define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \
HCR_BSU_IS | HCR_FB | HCR_TACR | \
HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \
- HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 )
+ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2)
#define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF)
#define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA)
#define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC)
@@ -344,10 +345,26 @@
ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
- ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
+ ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET)
-#define CPACR_EL1_TTA (1 << 28)
#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\
CPACR_EL1_ZEN_EL1EN)
+#define kvm_mode_names \
+ { PSR_MODE_EL0t, "EL0t" }, \
+ { PSR_MODE_EL1t, "EL1t" }, \
+ { PSR_MODE_EL1h, "EL1h" }, \
+ { PSR_MODE_EL2t, "EL2t" }, \
+ { PSR_MODE_EL2h, "EL2h" }, \
+ { PSR_MODE_EL3t, "EL3t" }, \
+ { PSR_MODE_EL3h, "EL3h" }, \
+ { PSR_AA32_MODE_USR, "32-bit USR" }, \
+ { PSR_AA32_MODE_FIQ, "32-bit FIQ" }, \
+ { PSR_AA32_MODE_IRQ, "32-bit IRQ" }, \
+ { PSR_AA32_MODE_SVC, "32-bit SVC" }, \
+ { PSR_AA32_MODE_ABT, "32-bit ABT" }, \
+ { PSR_AA32_MODE_HYP, "32-bit HYP" }, \
+ { PSR_AA32_MODE_UND, "32-bit UND" }, \
+ { PSR_AA32_MODE_SYS, "32-bit SYS" }
+
#endif /* __ARM64_KVM_ARM_H__ */
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 193583df2d9c..b31b32ecbe2d 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -33,6 +33,12 @@ enum exception_type {
except_type_serror = 0x180,
};
+#define kvm_exception_type_names \
+ { except_type_sync, "SYNC" }, \
+ { except_type_irq, "IRQ" }, \
+ { except_type_fiq, "FIQ" }, \
+ { except_type_serror, "SERROR" }
+
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu);
@@ -44,6 +50,10 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu);
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);
+void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
+int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
+int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);
+
#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__)
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
@@ -88,10 +98,6 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
if (vcpu_el1_is_32bit(vcpu))
vcpu->arch.hcr_el2 &= ~HCR_RW;
- if (cpus_have_const_cap(ARM64_MISMATCHED_CACHE_TYPE) ||
- vcpu_el1_is_32bit(vcpu))
- vcpu->arch.hcr_el2 |= HCR_TID2;
-
if (kvm_has_mte(vcpu->kvm))
vcpu->arch.hcr_el2 |= HCR_ATA;
}
@@ -183,6 +189,62 @@ static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
vcpu_gp_regs(vcpu)->regs[reg_num] = val;
}
+static inline bool vcpu_is_el2_ctxt(const struct kvm_cpu_context *ctxt)
+{
+ switch (ctxt->regs.pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) {
+ case PSR_MODE_EL2h:
+ case PSR_MODE_EL2t:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu)
+{
+ return vcpu_is_el2_ctxt(&vcpu->arch.ctxt);
+}
+
+static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt)
+{
+ return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H;
+}
+
+static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu)
+{
+ return __vcpu_el2_e2h_is_set(&vcpu->arch.ctxt);
+}
+
+static inline bool __vcpu_el2_tge_is_set(const struct kvm_cpu_context *ctxt)
+{
+ return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_TGE;
+}
+
+static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
+{
+ return __vcpu_el2_tge_is_set(&vcpu->arch.ctxt);
+}
+
+static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt)
+{
+ /*
+ * We are in a hypervisor context if the vcpu mode is EL2 or
+ * E2H and TGE bits are set. The latter means we are in the user space
+ * of the VHE kernel. ARMv8.1 ARM describes this as 'InHost'
+ *
+ * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the
+ * rest of the KVM code, and will result in a misbehaving guest.
+ */
+ return vcpu_is_el2_ctxt(ctxt) ||
+ (__vcpu_el2_e2h_is_set(ctxt) && __vcpu_el2_tge_is_set(ctxt)) ||
+ __vcpu_el2_tge_is_set(ctxt);
+}
+
+static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
+{
+ return __is_hyp_ctxt(&vcpu->arch.ctxt);
+}
+
/*
* The layout of SPSR for an AArch32 state is different when observed from an
* AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 35a159d131b5..a1892a8f6032 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -60,14 +60,19 @@
enum kvm_mode {
KVM_MODE_DEFAULT,
KVM_MODE_PROTECTED,
+ KVM_MODE_NV,
KVM_MODE_NONE,
};
+#ifdef CONFIG_KVM
enum kvm_mode kvm_get_mode(void);
+#else
+static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; };
+#endif
DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
-extern unsigned int kvm_sve_max_vl;
-int kvm_arm_init_sve(void);
+extern unsigned int __ro_after_init kvm_sve_max_vl;
+int __init kvm_arm_init_sve(void);
u32 __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -252,6 +257,7 @@ struct kvm_vcpu_fault_info {
enum vcpu_sysreg {
__INVALID_SYSREG__, /* 0 is reserved as an invalid value */
MPIDR_EL1, /* MultiProcessor Affinity Register */
+ CLIDR_EL1, /* Cache Level ID Register */
CSSELR_EL1, /* Cache Size Selection Register */
SCTLR_EL1, /* System Control Register */
ACTLR_EL1, /* Auxiliary Control Register */
@@ -320,12 +326,43 @@ enum vcpu_sysreg {
TFSR_EL1, /* Tag Fault Status Register (EL1) */
TFSRE0_EL1, /* Tag Fault Status Register (EL0) */
- /* 32bit specific registers. Keep them at the end of the range */
+ /* 32bit specific registers. */
DACR32_EL2, /* Domain Access Control Register */
IFSR32_EL2, /* Instruction Fault Status Register */
FPEXC32_EL2, /* Floating-Point Exception Control Register */
DBGVCR32_EL2, /* Debug Vector Catch Register */
+ /* EL2 registers */
+ VPIDR_EL2, /* Virtualization Processor ID Register */
+ VMPIDR_EL2, /* Virtualization Multiprocessor ID Register */
+ SCTLR_EL2, /* System Control Register (EL2) */
+ ACTLR_EL2, /* Auxiliary Control Register (EL2) */
+ HCR_EL2, /* Hypervisor Configuration Register */
+ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */
+ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */
+ HSTR_EL2, /* Hypervisor System Trap Register */
+ HACR_EL2, /* Hypervisor Auxiliary Control Register */
+ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */
+ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */
+ TCR_EL2, /* Translation Control Register (EL2) */
+ VTTBR_EL2, /* Virtualization Translation Table Base Register */
+ VTCR_EL2, /* Virtualization Translation Control Register */
+ SPSR_EL2, /* EL2 saved program status register */
+ ELR_EL2, /* EL2 exception link register */
+ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */
+ AFSR1_EL2, /* Auxiliary Fault Status Register 1 (EL2) */
+ ESR_EL2, /* Exception Syndrome Register (EL2) */
+ FAR_EL2, /* Fault Address Register (EL2) */
+ HPFAR_EL2, /* Hypervisor IPA Fault Address Register */
+ MAIR_EL2, /* Memory Attribute Indirection Register (EL2) */
+ AMAIR_EL2, /* Auxiliary Memory Attribute Indirection Register (EL2) */
+ VBAR_EL2, /* Vector Base Address Register (EL2) */
+ RVBAR_EL2, /* Reset Vector Base Address Register */
+ CONTEXTIDR_EL2, /* Context ID Register (EL2) */
+ TPIDR_EL2, /* EL2 Software Thread ID Register */
+ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */
+ SP_EL2, /* EL2 Stack Pointer */
+
NR_SYS_REGS /* Nothing after this line! */
};
@@ -501,6 +538,9 @@ struct kvm_vcpu_arch {
u64 last_steal;
gpa_t base;
} steal;
+
+ /* Per-vcpu CCSIDR override or NULL */
+ u32 *ccsidr;
};
/*
@@ -598,7 +638,7 @@ struct kvm_vcpu_arch {
#define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1)
#define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2)
#define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3)
-/* For AArch64 with NV (one day): */
+/* For AArch64 with NV: */
#define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4)
#define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5)
#define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6)
@@ -609,6 +649,8 @@ struct kvm_vcpu_arch {
#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5))
/* Save TRBE context if active */
#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6))
+/* vcpu running in HYP context */
+#define VCPU_HYP_CONTEXT __vcpu_single_flag(iflags, BIT(7))
/* SVE enabled for host EL0 */
#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0))
@@ -705,7 +747,6 @@ static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
return false;
switch (reg) {
- case CSSELR_EL1: *val = read_sysreg_s(SYS_CSSELR_EL1); break;
case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break;
case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break;
case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break;
@@ -750,7 +791,6 @@ static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
return false;
switch (reg) {
- case CSSELR_EL1: write_sysreg_s(val, SYS_CSSELR_EL1); break;
case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break;
case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break;
case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break;
@@ -877,7 +917,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);
-int kvm_sys_reg_table_init(void);
+int __init kvm_sys_reg_table_init(void);
/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
@@ -908,20 +948,20 @@ int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
-extern unsigned int kvm_arm_vmid_bits;
-int kvm_arm_vmid_alloc_init(void);
-void kvm_arm_vmid_alloc_free(void);
+extern unsigned int __ro_after_init kvm_arm_vmid_bits;
+int __init kvm_arm_vmid_alloc_init(void);
+void __init kvm_arm_vmid_alloc_free(void);
void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
void kvm_arm_vmid_clear_active(void);
static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
{
- vcpu_arch->steal.base = GPA_INVALID;
+ vcpu_arch->steal.base = INVALID_GPA;
}
static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
{
- return (vcpu_arch->steal.base != GPA_INVALID);
+ return (vcpu_arch->steal.base != INVALID_GPA);
}
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);
@@ -943,7 +983,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void)
void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
-static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
@@ -994,7 +1033,7 @@ static inline void kvm_clr_pmu_events(u32 clr) {}
void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu);
-int kvm_set_ipa_limit(void);
+int __init kvm_set_ipa_limit(void);
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 6797eafe7890..bdd9cf546d95 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -122,6 +122,7 @@ extern u64 kvm_nvhe_sym(id_aa64isar2_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
extern u64 kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val);
+extern u64 kvm_nvhe_sym(id_aa64smfr0_el1_sys_val);
extern unsigned long kvm_nvhe_sym(__icache_flags);
extern unsigned int kvm_nvhe_sym(kvm_arm_vmid_bits);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index e4a7e6369499..083cc47dca08 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -115,6 +115,7 @@ alternative_cb_end
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
+#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
void kvm_update_va_mask(struct alt_instr *alt,
@@ -163,7 +164,7 @@ int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **haddr);
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
void **haddr);
-void free_hyp_pgds(void);
+void __init free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
@@ -175,7 +176,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
-int kvm_mmu_init(u32 *hyp_va_bits);
+int __init kvm_mmu_init(u32 *hyp_va_bits);
static inline void *__kvm_vector_slot2addr(void *base,
enum arm64_hyp_spectre_vector slot)
@@ -192,7 +193,15 @@ struct kvm;
static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
- return (vcpu_read_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
+ u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C;
+ int reg;
+
+ if (vcpu_is_el2(vcpu))
+ reg = SCTLR_EL2;
+ else
+ reg = SCTLR_EL1;
+
+ return (vcpu_read_sys_reg(vcpu, reg) & cache_bits) == cache_bits;
}
static inline void __clean_dcache_guest_page(void *va, size_t size)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
new file mode 100644
index 000000000000..8fb67f032fd1
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARM64_KVM_NESTED_H
+#define __ARM64_KVM_NESTED_H
+
+#include <linux/kvm_host.h>
+
+static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu)
+{
+ return (!__is_defined(__KVM_NVHE_HYPERVISOR__) &&
+ cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) &&
+ test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features));
+}
+
+struct sys_reg_params;
+struct sys_reg_desc;
+
+void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p,
+ const struct sys_reg_desc *r);
+
+#endif /* __ARM64_KVM_NESTED_H */
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 63f81b27a4e3..4cd6762bda80 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -71,6 +71,11 @@ static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
return pte;
}
+static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte)
+{
+ return __phys_to_pfn(kvm_pte_to_phys(pte));
+}
+
static inline u64 kvm_granule_shift(u32 level)
{
/* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */
@@ -188,12 +193,15 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
* children.
* @KVM_PGTABLE_WALK_SHARED: Indicates the page-tables may be shared
* with other software walkers.
+ * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was
+ * invoked from a fault handler.
*/
enum kvm_pgtable_walk_flags {
KVM_PGTABLE_WALK_LEAF = BIT(0),
KVM_PGTABLE_WALK_TABLE_PRE = BIT(1),
KVM_PGTABLE_WALK_TABLE_POST = BIT(2),
KVM_PGTABLE_WALK_SHARED = BIT(3),
+ KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4),
};
struct kvm_pgtable_visit_ctx {
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 043ecc3405e7..9e3ecba3c4e6 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -325,7 +325,6 @@
#define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0)
-#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0)
#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7)
#define SYS_RNDR_EL0 sys_reg(3, 3, 2, 4, 0)
@@ -411,23 +410,51 @@
#define SYS_PMCCFILTR_EL0 sys_reg(3, 3, 14, 15, 7)
+#define SYS_VPIDR_EL2 sys_reg(3, 4, 0, 0, 0)
+#define SYS_VMPIDR_EL2 sys_reg(3, 4, 0, 0, 5)
+
#define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0)
+#define SYS_ACTLR_EL2 sys_reg(3, 4, 1, 0, 1)
+#define SYS_HCR_EL2 sys_reg(3, 4, 1, 1, 0)
+#define SYS_MDCR_EL2 sys_reg(3, 4, 1, 1, 1)
+#define SYS_CPTR_EL2 sys_reg(3, 4, 1, 1, 2)
+#define SYS_HSTR_EL2 sys_reg(3, 4, 1, 1, 3)
#define SYS_HFGRTR_EL2 sys_reg(3, 4, 1, 1, 4)
#define SYS_HFGWTR_EL2 sys_reg(3, 4, 1, 1, 5)
#define SYS_HFGITR_EL2 sys_reg(3, 4, 1, 1, 6)
+#define SYS_HACR_EL2 sys_reg(3, 4, 1, 1, 7)
+
+#define SYS_TTBR0_EL2 sys_reg(3, 4, 2, 0, 0)
+#define SYS_TTBR1_EL2 sys_reg(3, 4, 2, 0, 1)
+#define SYS_TCR_EL2 sys_reg(3, 4, 2, 0, 2)
+#define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0)
+#define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2)
+
#define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1)
#define SYS_HDFGRTR_EL2 sys_reg(3, 4, 3, 1, 4)
#define SYS_HDFGWTR_EL2 sys_reg(3, 4, 3, 1, 5)
#define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6)
#define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0)
#define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1)
+#define SYS_SP_EL1 sys_reg(3, 4, 4, 1, 0)
#define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1)
+#define SYS_AFSR0_EL2 sys_reg(3, 4, 5, 1, 0)
+#define SYS_AFSR1_EL2 sys_reg(3, 4, 5, 1, 1)
#define SYS_ESR_EL2 sys_reg(3, 4, 5, 2, 0)
#define SYS_VSESR_EL2 sys_reg(3, 4, 5, 2, 3)
#define SYS_FPEXC32_EL2 sys_reg(3, 4, 5, 3, 0)
#define SYS_TFSR_EL2 sys_reg(3, 4, 5, 6, 0)
-#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1)
+#define SYS_FAR_EL2 sys_reg(3, 4, 6, 0, 0)
+#define SYS_HPFAR_EL2 sys_reg(3, 4, 6, 0, 4)
+
+#define SYS_MAIR_EL2 sys_reg(3, 4, 10, 2, 0)
+#define SYS_AMAIR_EL2 sys_reg(3, 4, 10, 3, 0)
+
+#define SYS_VBAR_EL2 sys_reg(3, 4, 12, 0, 0)
+#define SYS_RVBAR_EL2 sys_reg(3, 4, 12, 0, 1)
+#define SYS_RMR_EL2 sys_reg(3, 4, 12, 0, 2)
+#define SYS_VDISR_EL2 sys_reg(3, 4, 12, 1, 1)
#define __SYS__AP0Rx_EL2(x) sys_reg(3, 4, 12, 8, x)
#define SYS_ICH_AP0R0_EL2 __SYS__AP0Rx_EL2(0)
#define SYS_ICH_AP0R1_EL2 __SYS__AP0Rx_EL2(1)
@@ -469,6 +496,12 @@
#define SYS_ICH_LR14_EL2 __SYS__LR8_EL2(6)
#define SYS_ICH_LR15_EL2 __SYS__LR8_EL2(7)
+#define SYS_CONTEXTIDR_EL2 sys_reg(3, 4, 13, 0, 1)
+#define SYS_TPIDR_EL2 sys_reg(3, 4, 13, 0, 2)
+
+#define SYS_CNTVOFF_EL2 sys_reg(3, 4, 14, 0, 3)
+#define SYS_CNTHCTL_EL2 sys_reg(3, 4, 14, 1, 0)
+
/* VHE encodings for architectural EL0/1 system registers */
#define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0)
#define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0)
@@ -491,6 +524,8 @@
#define SYS_CNTV_CTL_EL02 sys_reg(3, 5, 14, 3, 1)
#define SYS_CNTV_CVAL_EL02 sys_reg(3, 5, 14, 3, 2)
+#define SYS_SP_EL2 sys_reg(3, 6, 4, 1, 0)
+
/* Common SCTLR_ELx flags. */
#define SCTLR_ELx_ENTP2 (BIT(60))
#define SCTLR_ELx_DSSBS (BIT(44))
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index a7a857f1784d..f8129c624b07 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -109,6 +109,7 @@ struct kvm_regs {
#define KVM_ARM_VCPU_SVE 4 /* enable SVE for this CPU */
#define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */
#define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */
+#define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */
struct kvm_vcpu_init {
__u32 target;
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 3ba70985e3a2..c307f69e9b55 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -11,11 +11,6 @@
#include <linux/of.h>
#define MAX_CACHE_LEVEL 7 /* Max 7 level supported */
-/* Ctypen, bits[3(n - 1) + 2 : 3(n - 1)], for n = 1 to 7 */
-#define CLIDR_CTYPE_SHIFT(level) (3 * (level - 1))
-#define CLIDR_CTYPE_MASK(level) (7 << CLIDR_CTYPE_SHIFT(level))
-#define CLIDR_CTYPE(clidr, level) \
- (((clidr) & CLIDR_CTYPE_MASK(level)) >> CLIDR_CTYPE_SHIFT(level))
int cache_line_size(void)
{
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 45a42cf2191c..87687e99fee3 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1967,6 +1967,20 @@ static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused)
write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
}
+static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
+ int scope)
+{
+ if (kvm_get_mode() != KVM_MODE_NV)
+ return false;
+
+ if (!has_cpuid_feature(cap, scope)) {
+ pr_warn("unavailable: %s\n", cap->desc);
+ return false;
+ }
+
+ return true;
+}
+
#ifdef CONFIG_ARM64_PAN
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
@@ -2263,6 +2277,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.cpu_enable = cpu_copy_el2regs,
},
{
+ .desc = "Nested Virtualization Support",
+ .capability = ARM64_HAS_NESTED_VIRT,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_nested_virt_support,
+ .sys_reg = SYS_ID_AA64MMFR2_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = ID_AA64MMFR2_EL1_NV_SHIFT,
+ .field_width = 4,
+ .min_field_value = ID_AA64MMFR2_EL1_NV_IMP,
+ },
+ {
.capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE,
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_32bit_el0,
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 111ff33d93ee..9439240c3fcf 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -16,30 +16,6 @@
#include <asm/ptrace.h>
#include <asm/virt.h>
-// Warning, hardcoded register allocation
-// This will clobber x1 and x2, and expect x1 to contain
-// the id register value as read from the HW
-.macro __check_override idreg, fld, width, pass, fail
- ubfx x1, x1, #\fld, #\width
- cbz x1, \fail
-
- adr_l x1, \idreg\()_override
- ldr x2, [x1, FTR_OVR_VAL_OFFSET]
- ldr x1, [x1, FTR_OVR_MASK_OFFSET]
- ubfx x2, x2, #\fld, #\width
- ubfx x1, x1, #\fld, #\width
- cmp x1, xzr
- and x2, x2, x1
- csinv x2, x2, xzr, ne
- cbnz x2, \pass
- b \fail
-.endm
-
-.macro check_override idreg, fld, pass, fail
- mrs x1, \idreg\()_el1
- __check_override \idreg \fld 4 \pass \fail
-.endm
-
.text
.pushsection .hyp.text, "ax"
@@ -98,65 +74,7 @@ SYM_CODE_START_LOCAL(elx_sync)
SYM_CODE_END(elx_sync)
SYM_CODE_START_LOCAL(__finalise_el2)
- check_override id_aa64pfr0 ID_AA64PFR0_EL1_SVE_SHIFT .Linit_sve .Lskip_sve
-
-.Linit_sve: /* SVE register access */
- mrs x0, cptr_el2 // Disable SVE traps
- bic x0, x0, #CPTR_EL2_TZ
- msr cptr_el2, x0
- isb
- mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
- msr_s SYS_ZCR_EL2, x1 // length for EL1.
-
-.Lskip_sve:
- check_override id_aa64pfr1 ID_AA64PFR1_EL1_SME_SHIFT .Linit_sme .Lskip_sme
-
-.Linit_sme: /* SME register access and priority mapping */
- mrs x0, cptr_el2 // Disable SME traps
- bic x0, x0, #CPTR_EL2_TSM
- msr cptr_el2, x0
- isb
-
- mrs x1, sctlr_el2
- orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps
- msr sctlr_el2, x1
- isb
-
- mov x0, #0 // SMCR controls
-
- // Full FP in SM?
- mrs_s x1, SYS_ID_AA64SMFR0_EL1
- __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_FA64_SHIFT 1 .Linit_sme_fa64 .Lskip_sme_fa64
-
-.Linit_sme_fa64:
- orr x0, x0, SMCR_ELx_FA64_MASK
-.Lskip_sme_fa64:
-
- // ZT0 available?
- mrs_s x1, SYS_ID_AA64SMFR0_EL1
- __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_SMEver_SHIFT 4 .Linit_sme_zt0 .Lskip_sme_zt0
-.Linit_sme_zt0:
- orr x0, x0, SMCR_ELx_EZT0_MASK
-.Lskip_sme_zt0:
-
- orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector
- msr_s SYS_SMCR_EL2, x0 // length for EL1.
-
- mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported?
- ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1
- cbz x1, .Lskip_sme
-
- msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal
-
- mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present?
- ubfx x1, x1, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4
- cbz x1, .Lskip_sme
-
- mrs_s x1, SYS_HCRX_EL2
- orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping
- msr_s SYS_HCRX_EL2, x1
-
-.Lskip_sme:
+ finalise_el2_state
// nVHE? No way! Give me the real thing!
// Sanity check: MMU *must* be off
@@ -164,7 +82,7 @@ SYM_CODE_START_LOCAL(__finalise_el2)
tbnz x1, #0, 1f
// Needs to be VHE capable, obviously
- check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f
+ check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2
1: mov_q x0, HVC_STUB_ERR
eret
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 05da3c8f7e88..ca6eadeb7d1a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -21,6 +21,7 @@ if VIRTUALIZATION
menuconfig KVM
bool "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ select KVM_GENERIC_HARDWARE_ENABLING
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 5e33c2d4645a..c0c050e53157 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -14,7 +14,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
- arch_timer.o trng.o vmid.o \
+ arch_timer.o trng.o vmid.o emulate-nested.o nested.o \
vgic/vgic.o vgic/vgic-init.o \
vgic/vgic-irqfd.o vgic/vgic-v2.o \
vgic/vgic-v3.o vgic/vgic-v4.o \
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index bb24a76b4224..00610477ec7b 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -428,14 +428,17 @@ static void timer_emulate(struct arch_timer_context *ctx)
* scheduled for the future. If the timer cannot fire at all,
* then we also don't need a soft timer.
*/
- if (!kvm_timer_irq_can_fire(ctx)) {
- soft_timer_cancel(&ctx->hrtimer);
+ if (should_fire || !kvm_timer_irq_can_fire(ctx))
return;
- }
soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
}
+static void set_cntvoff(u64 cntvoff)
+{
+ kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
+}
+
static void timer_save_state(struct arch_timer_context *ctx)
{
struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
@@ -459,6 +462,22 @@ static void timer_save_state(struct arch_timer_context *ctx)
write_sysreg_el0(0, SYS_CNTV_CTL);
isb();
+ /*
+ * The kernel may decide to run userspace after
+ * calling vcpu_put, so we reset cntvoff to 0 to
+ * ensure a consistent read between user accesses to
+ * the virtual counter and kernel access to the
+ * physical counter of non-VHE case.
+ *
+ * For VHE, the virtual counter uses a fixed virtual
+ * offset of zero, so no need to zero CNTVOFF_EL2
+ * register, but this is actually useful when switching
+ * between EL1/vEL2 with NV.
+ *
+ * Do it unconditionally, as this is either unavoidable
+ * or dirt cheap.
+ */
+ set_cntvoff(0);
break;
case TIMER_PTIMER:
timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL));
@@ -532,6 +551,7 @@ static void timer_restore_state(struct arch_timer_context *ctx)
switch (index) {
case TIMER_VTIMER:
+ set_cntvoff(timer_get_offset(ctx));
write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL);
isb();
write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
@@ -552,11 +572,6 @@ out:
local_irq_restore(flags);
}
-static void set_cntvoff(u64 cntvoff)
-{
- kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
-}
-
static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
{
int r;
@@ -631,8 +646,6 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
kvm_timer_vcpu_load_nogic(vcpu);
}
- set_cntvoff(timer_get_offset(map.direct_vtimer));
-
kvm_timer_unblocking(vcpu);
timer_restore_state(map.direct_vtimer);
@@ -688,15 +701,6 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
if (kvm_vcpu_is_blocking(vcpu))
kvm_timer_blocking(vcpu);
-
- /*
- * The kernel may decide to run userspace after calling vcpu_put, so
- * we reset cntvoff to 0 to ensure a consistent read between user
- * accesses to the virtual counter and kernel access to the physical
- * counter of non-VHE case. For VHE, the virtual counter uses a fixed
- * virtual offset of zero, so no need to zero CNTVOFF_EL2 register.
- */
- set_cntvoff(0);
}
/*
@@ -811,10 +815,18 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
ptimer->host_timer_irq_flags = host_ptimer_irq_flags;
}
-static void kvm_timer_init_interrupt(void *info)
+void kvm_timer_cpu_up(void)
{
enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
- enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+ if (host_ptimer_irq)
+ enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
+}
+
+void kvm_timer_cpu_down(void)
+{
+ disable_percpu_irq(host_vtimer_irq);
+ if (host_ptimer_irq)
+ disable_percpu_irq(host_ptimer_irq);
}
int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
@@ -926,14 +938,22 @@ u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
enum kvm_arch_timers tmr,
enum kvm_arch_timer_regs treg)
{
+ struct arch_timer_context *timer;
+ struct timer_map map;
u64 val;
+ get_timer_map(vcpu, &map);
+ timer = vcpu_get_timer(vcpu, tmr);
+
+ if (timer == map.emul_ptimer)
+ return kvm_arm_timer_read(vcpu, timer, treg);
+
preempt_disable();
- kvm_timer_vcpu_put(vcpu);
+ timer_save_state(timer);
- val = kvm_arm_timer_read(vcpu, vcpu_get_timer(vcpu, tmr), treg);
+ val = kvm_arm_timer_read(vcpu, timer, treg);
- kvm_timer_vcpu_load(vcpu);
+ timer_restore_state(timer);
preempt_enable();
return val;
@@ -967,25 +987,22 @@ void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
enum kvm_arch_timer_regs treg,
u64 val)
{
- preempt_disable();
- kvm_timer_vcpu_put(vcpu);
-
- kvm_arm_timer_write(vcpu, vcpu_get_timer(vcpu, tmr), treg, val);
-
- kvm_timer_vcpu_load(vcpu);
- preempt_enable();
-}
-
-static int kvm_timer_starting_cpu(unsigned int cpu)
-{
- kvm_timer_init_interrupt(NULL);
- return 0;
-}
+ struct arch_timer_context *timer;
+ struct timer_map map;
-static int kvm_timer_dying_cpu(unsigned int cpu)
-{
- disable_percpu_irq(host_vtimer_irq);
- return 0;
+ get_timer_map(vcpu, &map);
+ timer = vcpu_get_timer(vcpu, tmr);
+ if (timer == map.emul_ptimer) {
+ soft_timer_cancel(&timer->hrtimer);
+ kvm_arm_timer_write(vcpu, timer, treg, val);
+ timer_emulate(timer);
+ } else {
+ preempt_disable();
+ timer_save_state(timer);
+ kvm_arm_timer_write(vcpu, timer, treg, val);
+ timer_restore_state(timer);
+ preempt_enable();
+ }
}
static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
@@ -1117,7 +1134,7 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info)
return 0;
}
-int kvm_timer_hyp_init(bool has_gic)
+int __init kvm_timer_hyp_init(bool has_gic)
{
struct arch_timer_kvm_info *info;
int err;
@@ -1185,9 +1202,6 @@ int kvm_timer_hyp_init(bool has_gic)
goto out_free_irq;
}
- cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING,
- "kvm/arm/timer:starting", kvm_timer_starting_cpu,
- kvm_timer_dying_cpu);
return 0;
out_free_irq:
free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 9c5573bc4614..3bd732eaf087 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -63,16 +63,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}
-int kvm_arch_hardware_setup(void *opaque)
-{
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return 0;
-}
-
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
struct kvm_enable_cap *cap)
{
@@ -146,7 +136,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto err_unshare_kvm;
- if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
ret = -ENOMEM;
goto err_unshare_kvm;
}
@@ -1539,7 +1529,7 @@ static int kvm_init_vector_slots(void)
return 0;
}
-static void cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
+static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
unsigned long tcr;
@@ -1682,7 +1672,15 @@ static void _kvm_arch_hardware_enable(void *discard)
int kvm_arch_hardware_enable(void)
{
+ int was_enabled = __this_cpu_read(kvm_arm_hardware_enabled);
+
_kvm_arch_hardware_enable(NULL);
+
+ if (!was_enabled) {
+ kvm_vgic_cpu_up();
+ kvm_timer_cpu_up();
+ }
+
return 0;
}
@@ -1696,6 +1694,11 @@ static void _kvm_arch_hardware_disable(void *discard)
void kvm_arch_hardware_disable(void)
{
+ if (__this_cpu_read(kvm_arm_hardware_enabled)) {
+ kvm_timer_cpu_down();
+ kvm_vgic_cpu_down();
+ }
+
if (!is_protected_kvm_enabled())
_kvm_arch_hardware_disable(NULL);
}
@@ -1738,26 +1741,26 @@ static struct notifier_block hyp_init_cpu_pm_nb = {
.notifier_call = hyp_init_cpu_pm_notifier,
};
-static void hyp_cpu_pm_init(void)
+static void __init hyp_cpu_pm_init(void)
{
if (!is_protected_kvm_enabled())
cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
}
-static void hyp_cpu_pm_exit(void)
+static void __init hyp_cpu_pm_exit(void)
{
if (!is_protected_kvm_enabled())
cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
}
#else
-static inline void hyp_cpu_pm_init(void)
+static inline void __init hyp_cpu_pm_init(void)
{
}
-static inline void hyp_cpu_pm_exit(void)
+static inline void __init hyp_cpu_pm_exit(void)
{
}
#endif
-static void init_cpu_logical_map(void)
+static void __init init_cpu_logical_map(void)
{
unsigned int cpu;
@@ -1774,7 +1777,7 @@ static void init_cpu_logical_map(void)
#define init_psci_0_1_impl_state(config, what) \
config.psci_0_1_ ## what ## _implemented = psci_ops.what
-static bool init_psci_relay(void)
+static bool __init init_psci_relay(void)
{
/*
* If PSCI has not been initialized, protected KVM cannot install
@@ -1797,7 +1800,7 @@ static bool init_psci_relay(void)
return true;
}
-static int init_subsystems(void)
+static int __init init_subsystems(void)
{
int err = 0;
@@ -1838,13 +1841,22 @@ static int init_subsystems(void)
kvm_register_perf_callbacks(NULL);
out:
+ if (err)
+ hyp_cpu_pm_exit();
+
if (err || !is_protected_kvm_enabled())
on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
return err;
}
-static void teardown_hyp_mode(void)
+static void __init teardown_subsystems(void)
+{
+ kvm_unregister_perf_callbacks();
+ hyp_cpu_pm_exit();
+}
+
+static void __init teardown_hyp_mode(void)
{
int cpu;
@@ -1855,7 +1867,7 @@ static void teardown_hyp_mode(void)
}
}
-static int do_pkvm_init(u32 hyp_va_bits)
+static int __init do_pkvm_init(u32 hyp_va_bits)
{
void *per_cpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
int ret;
@@ -1887,11 +1899,12 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
+ kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
}
-static int kvm_hyp_init_protection(u32 hyp_va_bits)
+static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
{
void *addr = phys_to_virt(hyp_mem_base);
int ret;
@@ -1909,10 +1922,8 @@ static int kvm_hyp_init_protection(u32 hyp_va_bits)
return 0;
}
-/**
- * Inits Hyp-mode on all online CPUs
- */
-static int init_hyp_mode(void)
+/* Inits Hyp-mode on all online CPUs */
+static int __init init_hyp_mode(void)
{
u32 hyp_va_bits;
int cpu;
@@ -2094,7 +2105,7 @@ out_err:
return err;
}
-static void _kvm_host_prot_finalize(void *arg)
+static void __init _kvm_host_prot_finalize(void *arg)
{
int *err = arg;
@@ -2102,7 +2113,7 @@ static void _kvm_host_prot_finalize(void *arg)
WRITE_ONCE(*err, -EINVAL);
}
-static int pkvm_drop_host_privileges(void)
+static int __init pkvm_drop_host_privileges(void)
{
int ret = 0;
@@ -2115,7 +2126,7 @@ static int pkvm_drop_host_privileges(void)
return ret;
}
-static int finalize_hyp_mode(void)
+static int __init finalize_hyp_mode(void)
{
if (!is_protected_kvm_enabled())
return 0;
@@ -2187,10 +2198,8 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
kvm_arm_resume_guest(irqfd->kvm);
}
-/**
- * Initialize Hyp-mode and memory mappings on all CPUs.
- */
-int kvm_arch_init(void *opaque)
+/* Initialize Hyp-mode and memory mappings on all CPUs */
+static __init int kvm_arm_init(void)
{
int err;
bool in_hyp_mode;
@@ -2241,7 +2250,7 @@ int kvm_arch_init(void *opaque)
err = kvm_init_vector_slots();
if (err) {
kvm_err("Cannot initialise vector slots\n");
- goto out_err;
+ goto out_hyp;
}
err = init_subsystems();
@@ -2252,7 +2261,7 @@ int kvm_arch_init(void *opaque)
err = finalize_hyp_mode();
if (err) {
kvm_err("Failed to finalize Hyp protection\n");
- goto out_hyp;
+ goto out_subs;
}
}
@@ -2264,10 +2273,19 @@ int kvm_arch_init(void *opaque)
kvm_info("Hyp mode initialized successfully\n");
}
+ /*
+ * FIXME: Do something reasonable if kvm_init() fails after pKVM
+ * hypervisor protection is finalized.
+ */
+ err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (err)
+ goto out_subs;
+
return 0;
+out_subs:
+ teardown_subsystems();
out_hyp:
- hyp_cpu_pm_exit();
if (!in_hyp_mode)
teardown_hyp_mode();
out_err:
@@ -2275,12 +2293,6 @@ out_err:
return err;
}
-/* NOP: Compiling as a module not supported */
-void kvm_arch_exit(void)
-{
- kvm_unregister_perf_callbacks();
-}
-
static int __init early_kvm_mode_cfg(char *arg)
{
if (!arg)
@@ -2310,6 +2322,11 @@ static int __init early_kvm_mode_cfg(char *arg)
return 0;
}
+ if (strcmp(arg, "nested") == 0 && !WARN_ON(!is_kernel_in_hyp_mode())) {
+ kvm_mode = KVM_MODE_NV;
+ return 0;
+ }
+
return -EINVAL;
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);
@@ -2319,10 +2336,4 @@ enum kvm_mode kvm_get_mode(void)
return kvm_mode;
}
-static int arm_init(void)
-{
- int rc = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
- return rc;
-}
-
-module_init(arm_init);
+module_init(kvm_arm_init);
diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c
new file mode 100644
index 000000000000..b96662029fb1
--- /dev/null
+++ b/arch/arm64/kvm/emulate-nested.c
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 - Linaro and Columbia University
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+
+#include "hyp/include/hyp/adjust_pc.h"
+
+#include "trace.h"
+
+static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
+{
+ u64 mode = spsr & PSR_MODE_MASK;
+
+ /*
+ * Possible causes for an Illegal Exception Return from EL2:
+ * - trying to return to EL3
+ * - trying to return to an illegal M value
+ * - trying to return to a 32bit EL
+ * - trying to return to EL1 with HCR_EL2.TGE set
+ */
+ if (mode == PSR_MODE_EL3t || mode == PSR_MODE_EL3h ||
+ mode == 0b00001 || (mode & BIT(1)) ||
+ (spsr & PSR_MODE32_BIT) ||
+ (vcpu_el2_tge_is_set(vcpu) && (mode == PSR_MODE_EL1t ||
+ mode == PSR_MODE_EL1h))) {
+ /*
+ * The guest is playing with our nerves. Preserve EL, SP,
+ * masks, flags from the existing PSTATE, and set IL.
+ * The HW will then generate an Illegal State Exception
+ * immediately after ERET.
+ */
+ spsr = *vcpu_cpsr(vcpu);
+
+ spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
+ PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT |
+ PSR_MODE_MASK | PSR_MODE32_BIT);
+ spsr |= PSR_IL_BIT;
+ }
+
+ return spsr;
+}
+
+void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
+{
+ u64 spsr, elr, mode;
+ bool direct_eret;
+
+ /*
+ * Going through the whole put/load motions is a waste of time
+ * if this is a VHE guest hypervisor returning to its own
+ * userspace, or the hypervisor performing a local exception
+ * return. No need to save/restore registers, no need to
+ * switch S2 MMU. Just do the canonical ERET.
+ */
+ spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
+ spsr = kvm_check_illegal_exception_return(vcpu, spsr);
+
+ mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ direct_eret = (mode == PSR_MODE_EL0t &&
+ vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu));
+ direct_eret |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);
+
+ if (direct_eret) {
+ *vcpu_pc(vcpu) = vcpu_read_sys_reg(vcpu, ELR_EL2);
+ *vcpu_cpsr(vcpu) = spsr;
+ trace_kvm_nested_eret(vcpu, *vcpu_pc(vcpu), spsr);
+ return;
+ }
+
+ preempt_disable();
+ kvm_arch_vcpu_put(vcpu);
+
+ elr = __vcpu_sys_reg(vcpu, ELR_EL2);
+
+ trace_kvm_nested_eret(vcpu, elr, spsr);
+
+ /*
+ * Note that the current exception level is always the virtual EL2,
+ * since we set HCR_EL2.NV bit only when entering the virtual EL2.
+ */
+ *vcpu_pc(vcpu) = elr;
+ *vcpu_cpsr(vcpu) = spsr;
+
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
+}
+
+static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
+ enum exception_type type)
+{
+ trace_kvm_inject_nested_exception(vcpu, esr_el2, type);
+
+ switch (type) {
+ case except_type_sync:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+ vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2);
+ break;
+ case except_type_irq:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
+ break;
+ default:
+ WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
+ }
+}
+
+/*
+ * Emulate taking an exception to EL2.
+ * See ARM ARM J8.1.2 AArch64.TakeException()
+ */
+static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
+ enum exception_type type)
+{
+ u64 pstate, mode;
+ bool direct_inject;
+
+ if (!vcpu_has_nv(vcpu)) {
+ kvm_err("Unexpected call to %s for the non-nesting configuration\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ /*
+ * As for ERET, we can avoid doing too much on the injection path by
+ * checking that we either took the exception from a VHE host
+ * userspace or from vEL2. In these cases, there is no change in
+ * translation regime (or anything else), so let's do as little as
+ * possible.
+ */
+ pstate = *vcpu_cpsr(vcpu);
+ mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ direct_inject = (mode == PSR_MODE_EL0t &&
+ vcpu_el2_e2h_is_set(vcpu) &&
+ vcpu_el2_tge_is_set(vcpu));
+ direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);
+
+ if (direct_inject) {
+ kvm_inject_el2_exception(vcpu, esr_el2, type);
+ return 1;
+ }
+
+ preempt_disable();
+
+ /*
+ * We may have an exception or PC update in the EL0/EL1 context.
+ * Commit it before entering EL2.
+ */
+ __kvm_adjust_pc(vcpu);
+
+ kvm_arch_vcpu_put(vcpu);
+
+ kvm_inject_el2_exception(vcpu, esr_el2, type);
+
+ /*
+ * A hard requirement is that a switch between EL1 and EL2
+ * contexts has to happen between a put/load, so that we can
+ * pick the correct timer and interrupt configuration, among
+ * other things.
+ *
+ * Make sure the exception actually took place before we load
+ * the new context.
+ */
+ __kvm_adjust_pc(vcpu);
+
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
+
+ return 1;
+}
+
+int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2)
+{
+ return kvm_inject_nested(vcpu, esr_el2, except_type_sync);
+}
+
+int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Do not inject an irq if the:
+ * - Current exception level is EL2, and
+ * - virtual HCR_EL2.TGE == 0
+ * - virtual HCR_EL2.IMO == 0
+ *
+ * See Table D1-17 "Physical interrupt target and masking when EL3 is
+ * not implemented and EL2 is implemented" in ARM DDI 0487C.a.
+ */
+
+ if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) &&
+ !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO))
+ return 1;
+
+ /* esr_el2 value doesn't matter for exits due to irqs. */
+ return kvm_inject_nested(vcpu, 0, except_type_irq);
+}
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 235775d0c825..1279949599b5 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -184,6 +184,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
sysreg_clear_set(CPACR_EL1,
CPACR_EL1_SMEN_EL0EN,
CPACR_EL1_SMEN_EL1EN);
+ isb();
}
if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) {
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index cf4c495a4321..07444fa22888 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -24,6 +24,7 @@
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
#include <asm/sigcontext.h>
#include "trace.h"
@@ -253,6 +254,11 @@ static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (!vcpu_el1_is_32bit(vcpu))
return -EINVAL;
break;
+ case PSR_MODE_EL2h:
+ case PSR_MODE_EL2t:
+ if (!vcpu_has_nv(vcpu))
+ return -EINVAL;
+ fallthrough;
case PSR_MODE_EL0t:
case PSR_MODE_EL1t:
case PSR_MODE_EL1h:
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e778eefcf214..a798c0b4d717 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -16,6 +16,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/debug-monitors.h>
#include <asm/stacktrace/nvhe.h>
#include <asm/traps.h>
@@ -41,6 +42,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu)
kvm_vcpu_hvc_get_imm(vcpu));
vcpu->stat.hvc_exit_stat++;
+ /* Forward hvc instructions to the virtual EL2 if the guest has EL2. */
+ if (vcpu_has_nv(vcpu)) {
+ if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD)
+ kvm_inject_undefined(vcpu);
+ else
+ kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
+
+ return 1;
+ }
+
ret = kvm_hvc_call_handler(vcpu);
if (ret < 0) {
vcpu_set_reg(vcpu, 0, ~0UL);
@@ -52,6 +63,8 @@ static int handle_hvc(struct kvm_vcpu *vcpu)
static int handle_smc(struct kvm_vcpu *vcpu)
{
+ int ret;
+
/*
* "If an SMC instruction executed at Non-secure EL1 is
* trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
@@ -59,10 +72,30 @@ static int handle_smc(struct kvm_vcpu *vcpu)
*
* We need to advance the PC after the trap, as it would
* otherwise return to the same address...
+ *
+ * Only handle SMCs from the virtual EL2 with an immediate of zero and
+ * skip it otherwise.
*/
- vcpu_set_reg(vcpu, 0, ~0UL);
+ if (!vcpu_is_el2(vcpu) || kvm_vcpu_hvc_get_imm(vcpu)) {
+ vcpu_set_reg(vcpu, 0, ~0UL);
+ kvm_incr_pc(vcpu);
+ return 1;
+ }
+
+ /*
+ * If imm is zero then it is likely an SMCCC call.
+ *
+ * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed
+ * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
+ * being treated as UNDEFINED.
+ */
+ ret = kvm_hvc_call_handler(vcpu);
+ if (ret < 0)
+ vcpu_set_reg(vcpu, 0, ~0UL);
+
kvm_incr_pc(vcpu);
- return 1;
+
+ return ret;
}
/*
@@ -196,6 +229,15 @@ static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
return 1;
}
+static int kvm_handle_eret(struct kvm_vcpu *vcpu)
+{
+ if (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ERET_ISS_ERET)
+ return kvm_handle_ptrauth(vcpu);
+
+ kvm_emulate_nested_eret(vcpu);
+ return 1;
+}
+
static exit_handle_fn arm_exit_handlers[] = {
[0 ... ESR_ELx_EC_MAX] = kvm_handle_unknown_ec,
[ESR_ELx_EC_WFx] = kvm_handle_wfx,
@@ -211,6 +253,7 @@ static exit_handle_fn arm_exit_handlers[] = {
[ESR_ELx_EC_SMC64] = handle_smc,
[ESR_ELx_EC_SYS64] = kvm_handle_sys_reg,
[ESR_ELx_EC_SVE] = handle_sve,
+ [ESR_ELx_EC_ERET] = kvm_handle_eret,
[ESR_ELx_EC_IABT_LOW] = kvm_handle_guest_abort,
[ESR_ELx_EC_DABT_LOW] = kvm_handle_guest_abort,
[ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 791d3de76771..424a5107cddb 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -14,6 +14,7 @@
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
#error Hypervisor code only!
@@ -23,7 +24,9 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
u64 val;
- if (__vcpu_read_sys_reg_from_cpu(reg, &val))
+ if (unlikely(vcpu_has_nv(vcpu)))
+ return vcpu_read_sys_reg(vcpu, reg);
+ else if (__vcpu_read_sys_reg_from_cpu(reg, &val))
return val;
return __vcpu_sys_reg(vcpu, reg);
@@ -31,18 +34,25 @@ static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
- if (__vcpu_write_sys_reg_to_cpu(val, reg))
- return;
-
- __vcpu_sys_reg(vcpu, reg) = val;
+ if (unlikely(vcpu_has_nv(vcpu)))
+ vcpu_write_sys_reg(vcpu, val, reg);
+ else if (!__vcpu_write_sys_reg_to_cpu(val, reg))
+ __vcpu_sys_reg(vcpu, reg) = val;
}
-static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
+static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
+ u64 val)
{
- if (has_vhe())
+ if (unlikely(vcpu_has_nv(vcpu))) {
+ if (target_mode == PSR_MODE_EL1h)
+ vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
+ else
+ vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
+ } else if (has_vhe()) {
write_sysreg_el1(val, SYS_SPSR);
- else
+ } else {
__vcpu_sys_reg(vcpu, SPSR_EL1) = val;
+ }
}
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
@@ -101,6 +111,11 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
__vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
break;
+ case PSR_MODE_EL2h:
+ vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2);
+ sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2);
+ __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2);
+ break;
default:
/* Don't do that */
BUG();
@@ -153,7 +168,7 @@ static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
new |= target_mode;
*vcpu_cpsr(vcpu) = new;
- __vcpu_write_spsr(vcpu, old);
+ __vcpu_write_spsr(vcpu, target_mode, old);
}
/*
@@ -323,11 +338,20 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu)
case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync);
+ break;
+
+ case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ):
+ enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
+ break;
+
default:
/*
- * Only EL1_SYNC makes sense so far, EL2_{SYNC,IRQ}
- * will be implemented at some point. Everything
- * else gets silently ignored.
+ * Only EL1_SYNC and EL2_{SYNC,IRQ} makes
+ * sense so far. Everything else gets silently
+ * ignored.
*/
break;
}
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index baa5b9b3dde5..699ea1f8d409 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -39,7 +39,6 @@ static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
- ctxt_sys_reg(ctxt, CSSELR_EL1) = read_sysreg(csselr_el1);
ctxt_sys_reg(ctxt, SCTLR_EL1) = read_sysreg_el1(SYS_SCTLR);
ctxt_sys_reg(ctxt, CPACR_EL1) = read_sysreg_el1(SYS_CPACR);
ctxt_sys_reg(ctxt, TTBR0_EL1) = read_sysreg_el1(SYS_TTBR0);
@@ -95,7 +94,6 @@ static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
{
write_sysreg(ctxt_sys_reg(ctxt, MPIDR_EL1), vmpidr_el2);
- write_sysreg(ctxt_sys_reg(ctxt, CSSELR_EL1), csselr_el1);
if (has_vhe() ||
!cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
@@ -156,9 +154,26 @@ static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1), SYS_SPSR);
}
+/* Read the VCPU state's PSTATE, but translate (v)EL2 to EL1. */
+static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt)
+{
+ u64 mode = ctxt->regs.pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL2t:
+ mode = PSR_MODE_EL1t;
+ break;
+ case PSR_MODE_EL2h:
+ mode = PSR_MODE_EL1h;
+ break;
+ }
+
+ return (ctxt->regs.pstate & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;
+}
+
static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
{
- u64 pstate = ctxt->regs.pstate;
+ u64 pstate = to_hw_pstate(ctxt);
u64 mode = pstate & PSR_AA32_MODE_MASK;
/*
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
index c953fb4b9a13..a6d67c2bb5ae 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S
@@ -183,6 +183,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu)
/* Initialize EL2 CPU state to sane values. */
init_el2_state // Clobbers x0..x2
+ finalise_el2_state
/* Enable MMU, set vectors and stack. */
mov x0, x28
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 0f9ac25afdf4..08d2b004f4b7 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -26,6 +26,7 @@ u64 id_aa64isar2_el1_sys_val;
u64 id_aa64mmfr0_el1_sys_val;
u64 id_aa64mmfr1_el1_sys_val;
u64 id_aa64mmfr2_el1_sys_val;
+u64 id_aa64smfr0_el1_sys_val;
/*
* Inject an unknown/undefined exception to an AArch64 guest while most of its
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index b11cf2c618a6..3d61bd3e591d 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -168,6 +168,25 @@ static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
return walker->cb(ctx, visit);
}
+static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
+ int r)
+{
+ /*
+ * Visitor callbacks return EAGAIN when the conditions that led to a
+ * fault are no longer reflected in the page tables due to a race to
+ * update a PTE. In the context of a fault handler this is interpreted
+ * as a signal to retry guest execution.
+ *
+ * Ignore the return code altogether for walkers outside a fault handler
+ * (e.g. write protecting a range of memory) and chug along with the
+ * page table walk.
+ */
+ if (r == -EAGAIN)
+ return !(walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT);
+
+ return !r;
+}
+
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, u32 level);
@@ -200,7 +219,7 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
table = kvm_pte_table(ctx.old, level);
}
- if (ret)
+ if (!kvm_pgtable_walk_continue(data->walker, ret))
goto out;
if (!table) {
@@ -211,13 +230,16 @@ static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
- if (ret)
+ if (!kvm_pgtable_walk_continue(data->walker, ret))
goto out;
if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);
out:
+ if (kvm_pgtable_walk_continue(data->walker, ret))
+ return 0;
+
return ret;
}
@@ -584,12 +606,14 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
lvls = 2;
vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+#ifdef CONFIG_ARM64_HW_AFDBM
/*
* Enable the Hardware Access Flag management, unconditionally
* on all CPUs. The features is RES0 on CPUs without the support
* and must be ignored by the CPUs.
*/
vtcr |= VTCR_EL2_HA;
+#endif /* CONFIG_ARM64_HW_AFDBM */
/* Set the vmid bits */
vtcr |= (get_vmid_bits(mmfr1) == 16) ?
@@ -1026,7 +1050,7 @@ static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
if (!kvm_pte_valid(ctx->old))
- return 0;
+ return -EAGAIN;
data->level = ctx->level;
data->pte = pte;
@@ -1094,9 +1118,15 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
kvm_pte_t kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr)
{
kvm_pte_t pte = 0;
- stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
- &pte, NULL, 0);
- dsb(ishst);
+ int ret;
+
+ ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
+ &pte, NULL,
+ KVM_PGTABLE_WALK_HANDLE_FAULT |
+ KVM_PGTABLE_WALK_SHARED);
+ if (!ret)
+ dsb(ishst);
+
return pte;
}
@@ -1141,6 +1171,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level,
+ KVM_PGTABLE_WALK_HANDLE_FAULT |
KVM_PGTABLE_WALK_SHARED);
if (!ret)
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 1a97391fedd2..cd3f3117bf16 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -40,7 +40,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu)
___activate_traps(vcpu);
val = read_sysreg(cpacr_el1);
- val |= CPACR_EL1_TTA;
+ val |= CPACR_ELx_TTA;
val &= ~(CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN |
CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN);
@@ -120,6 +120,25 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
+ /*
+ * If we were in HYP context on entry, adjust the PSTATE view
+ * so that the usual helpers work correctly.
+ */
+ if (unlikely(vcpu_get_flag(vcpu, VCPU_HYP_CONTEXT))) {
+ u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+
+ switch (mode) {
+ case PSR_MODE_EL1t:
+ mode = PSR_MODE_EL2t;
+ break;
+ case PSR_MODE_EL1h:
+ mode = PSR_MODE_EL2h;
+ break;
+ }
+
+ *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
+ *vcpu_cpsr(vcpu) |= mode;
+ }
}
/* Switch to the guest for VHE systems running in EL2 */
@@ -154,6 +173,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
sysreg_restore_guest_state_vhe(guest_ctxt);
__debug_switch_to_guest(vcpu);
+ if (is_hyp_ctxt(vcpu))
+ vcpu_set_flag(vcpu, VCPU_HYP_CONTEXT);
+ else
+ vcpu_clear_flag(vcpu, VCPU_HYP_CONTEXT);
+
do {
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index c9f401fa01a9..64c086c02c60 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -198,7 +198,7 @@ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
break;
case ARM_SMCCC_HV_PV_TIME_ST:
gpa = kvm_init_stolen_time(vcpu);
- if (gpa != GPA_INVALID)
+ if (gpa != INVALID_GPA)
val[0] = gpa;
break;
case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index f32f4a2a347f..64c3aec0d937 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -12,17 +12,55 @@
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
#include <asm/esr.h>
+static void pend_sync_exception(struct kvm_vcpu *vcpu)
+{
+ /* If not nesting, EL1 is the only possible exception target */
+ if (likely(!vcpu_has_nv(vcpu))) {
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ return;
+ }
+
+ /*
+ * With NV, we need to pick between EL1 and EL2. Note that we
+ * never deal with a nesting exception here, hence never
+ * changing context, and the exception itself can be delayed
+ * until the next entry.
+ */
+ switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) {
+ case PSR_MODE_EL2h:
+ case PSR_MODE_EL2t:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+ break;
+ case PSR_MODE_EL1h:
+ case PSR_MODE_EL1t:
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ break;
+ case PSR_MODE_EL0t:
+ if (vcpu_el2_tge_is_set(vcpu))
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
+ else
+ kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static bool match_target_el(struct kvm_vcpu *vcpu, unsigned long target)
+{
+ return (vcpu_get_flag(vcpu, EXCEPT_MASK) == target);
+}
+
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
{
unsigned long cpsr = *vcpu_cpsr(vcpu);
bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
u64 esr = 0;
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
-
- vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
+ pend_sync_exception(vcpu);
/*
* Build an {i,d}abort, depending on the level and the
@@ -43,14 +81,22 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr
if (!is_iabt)
esr |= ESR_ELx_EC_DABT_LOW << ESR_ELx_EC_SHIFT;
- vcpu_write_sys_reg(vcpu, esr | ESR_ELx_FSC_EXTABT, ESR_EL1);
+ esr |= ESR_ELx_FSC_EXTABT;
+
+ if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) {
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
+ vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ } else {
+ vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
+ vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
+ }
}
static void inject_undef64(struct kvm_vcpu *vcpu)
{
u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
- kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
+ pend_sync_exception(vcpu);
/*
* Build an unknown exception, depending on the instruction
@@ -59,7 +105,10 @@ static void inject_undef64(struct kvm_vcpu *vcpu)
if (kvm_vcpu_trap_il_is32bit(vcpu))
esr |= ESR_ELx_IL;
- vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ if (match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC)))
+ vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
+ else
+ vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
}
#define DFSR_FSC_EXTABT_LPAE 0x10
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a3ee3b605c9b..7113587222ff 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -25,11 +25,11 @@
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
-static unsigned long hyp_idmap_start;
-static unsigned long hyp_idmap_end;
-static phys_addr_t hyp_idmap_vector;
+static unsigned long __ro_after_init hyp_idmap_start;
+static unsigned long __ro_after_init hyp_idmap_end;
+static phys_addr_t __ro_after_init hyp_idmap_vector;
-static unsigned long io_map_base;
+static unsigned long __ro_after_init io_map_base;
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
@@ -46,16 +46,17 @@ static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
* long will also starve other vCPUs. We have to also make sure that the page
* tables are not freed while we released the lock.
*/
-static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
+static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
phys_addr_t end,
int (*fn)(struct kvm_pgtable *, u64, u64),
bool resched)
{
+ struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
int ret;
u64 next;
do {
- struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
+ struct kvm_pgtable *pgt = mmu->pgt;
if (!pgt)
return -EINVAL;
@@ -71,8 +72,8 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr,
return ret;
}
-#define stage2_apply_range_resched(kvm, addr, end, fn) \
- stage2_apply_range(kvm, addr, end, fn, true)
+#define stage2_apply_range_resched(mmu, addr, end, fn) \
+ stage2_apply_range(mmu, addr, end, fn, true)
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
@@ -235,7 +236,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
lockdep_assert_held_write(&kvm->mmu_lock);
WARN_ON(size & ~PAGE_MASK);
- WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
+ WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
may_block));
}
@@ -250,7 +251,7 @@ static void stage2_flush_memslot(struct kvm *kvm,
phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
+ stage2_apply_range_resched(&kvm->arch.mmu, addr, end, kvm_pgtable_stage2_flush);
}
/**
@@ -280,7 +281,7 @@ static void stage2_flush_vm(struct kvm *kvm)
/**
* free_hyp_pgds - free Hyp-mode page tables
*/
-void free_hyp_pgds(void)
+void __init free_hyp_pgds(void)
{
mutex_lock(&kvm_hyp_pgd_mutex);
if (hyp_pgtable) {
@@ -934,8 +935,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
*/
static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
- struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
- stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect);
+ stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
}
/**
@@ -1383,7 +1383,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
__pfn_to_phys(pfn), prot,
- memcache, KVM_PGTABLE_WALK_SHARED);
+ memcache,
+ KVM_PGTABLE_WALK_HANDLE_FAULT |
+ KVM_PGTABLE_WALK_SHARED);
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret) {
@@ -1401,20 +1403,18 @@ out_unlock:
/* Resolve the access fault by making the page young again. */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
- pte_t pte;
- kvm_pte_t kpte;
+ kvm_pte_t pte;
struct kvm_s2_mmu *mmu;
trace_kvm_access_fault(fault_ipa);
- write_lock(&vcpu->kvm->mmu_lock);
+ read_lock(&vcpu->kvm->mmu_lock);
mmu = vcpu->arch.hw_mmu;
- kpte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
- write_unlock(&vcpu->kvm->mmu_lock);
+ pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
+ read_unlock(&vcpu->kvm->mmu_lock);
- pte = __pte(kpte);
- if (pte_valid(pte))
- kvm_set_pfn_accessed(pte_pfn(pte));
+ if (kvm_pte_valid(pte))
+ kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
}
/**
@@ -1668,7 +1668,7 @@ static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
.virt_to_phys = kvm_host_pa,
};
-int kvm_mmu_init(u32 *hyp_va_bits)
+int __init kvm_mmu_init(u32 *hyp_va_bits)
{
int err;
u32 idmap_bits;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
new file mode 100644
index 000000000000..315354d27978
--- /dev/null
+++ b/arch/arm64/kvm/nested.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2017 - Columbia University and Linaro Ltd.
+ * Author: Jintack Lim <jintack.lim@linaro.org>
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_nested.h>
+#include <asm/sysreg.h>
+
+#include "sys_regs.h"
+
+/* Protection against the sysreg repainting madness... */
+#define NV_FTR(r, f) ID_AA64##r##_EL1_##f
+
+/*
+ * Our emulated CPU doesn't support all the possible features. For the
+ * sake of simplicity (and probably mental sanity), wipe out a number
+ * of feature bits we don't intend to support for the time being.
+ * This list should get updated as new features get added to the NV
+ * support, and new extension to the architecture.
+ */
+void access_nested_id_reg(struct kvm_vcpu *v, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 id = reg_to_encoding(r);
+ u64 val, tmp;
+
+ val = p->regval;
+
+ switch (id) {
+ case SYS_ID_AA64ISAR0_EL1:
+ /* Support everything but TME, O.S. and Range TLBIs */
+ val &= ~(NV_FTR(ISAR0, TLB) |
+ NV_FTR(ISAR0, TME));
+ break;
+
+ case SYS_ID_AA64ISAR1_EL1:
+ /* Support everything but PtrAuth and Spec Invalidation */
+ val &= ~(GENMASK_ULL(63, 56) |
+ NV_FTR(ISAR1, SPECRES) |
+ NV_FTR(ISAR1, GPI) |
+ NV_FTR(ISAR1, GPA) |
+ NV_FTR(ISAR1, API) |
+ NV_FTR(ISAR1, APA));
+ break;
+
+ case SYS_ID_AA64PFR0_EL1:
+ /* No AMU, MPAM, S-EL2, RAS or SVE */
+ val &= ~(GENMASK_ULL(55, 52) |
+ NV_FTR(PFR0, AMU) |
+ NV_FTR(PFR0, MPAM) |
+ NV_FTR(PFR0, SEL2) |
+ NV_FTR(PFR0, RAS) |
+ NV_FTR(PFR0, SVE) |
+ NV_FTR(PFR0, EL3) |
+ NV_FTR(PFR0, EL2) |
+ NV_FTR(PFR0, EL1));
+ /* 64bit EL1/EL2/EL3 only */
+ val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001);
+ val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001);
+ val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001);
+ break;
+
+ case SYS_ID_AA64PFR1_EL1:
+ /* Only support SSBS */
+ val &= NV_FTR(PFR1, SSBS);
+ break;
+
+ case SYS_ID_AA64MMFR0_EL1:
+ /* Hide ECV, FGT, ExS, Secure Memory */
+ val &= ~(GENMASK_ULL(63, 43) |
+ NV_FTR(MMFR0, TGRAN4_2) |
+ NV_FTR(MMFR0, TGRAN16_2) |
+ NV_FTR(MMFR0, TGRAN64_2) |
+ NV_FTR(MMFR0, SNSMEM));
+
+ /* Disallow unsupported S2 page sizes */
+ switch (PAGE_SIZE) {
+ case SZ_64K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0001);
+ fallthrough;
+ case SZ_16K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0001);
+ fallthrough;
+ case SZ_4K:
+ /* Support everything */
+ break;
+ }
+ /*
+ * Since we can't support a guest S2 page size smaller than
+ * the host's own page size (due to KVM only populating its
+ * own S2 using the kernel's page size), advertise the
+ * limitation using FEAT_GTG.
+ */
+ switch (PAGE_SIZE) {
+ case SZ_4K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN4_2), 0b0010);
+ fallthrough;
+ case SZ_16K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN16_2), 0b0010);
+ fallthrough;
+ case SZ_64K:
+ val |= FIELD_PREP(NV_FTR(MMFR0, TGRAN64_2), 0b0010);
+ break;
+ }
+ /* Cap PARange to 48bits */
+ tmp = FIELD_GET(NV_FTR(MMFR0, PARANGE), val);
+ if (tmp > 0b0101) {
+ val &= ~NV_FTR(MMFR0, PARANGE);
+ val |= FIELD_PREP(NV_FTR(MMFR0, PARANGE), 0b0101);
+ }
+ break;
+
+ case SYS_ID_AA64MMFR1_EL1:
+ val &= (NV_FTR(MMFR1, PAN) |
+ NV_FTR(MMFR1, LO) |
+ NV_FTR(MMFR1, HPDS) |
+ NV_FTR(MMFR1, VH) |
+ NV_FTR(MMFR1, VMIDBits));
+ break;
+
+ case SYS_ID_AA64MMFR2_EL1:
+ val &= ~(NV_FTR(MMFR2, EVT) |
+ NV_FTR(MMFR2, BBM) |
+ NV_FTR(MMFR2, TTL) |
+ GENMASK_ULL(47, 44) |
+ NV_FTR(MMFR2, ST) |
+ NV_FTR(MMFR2, CCIDX) |
+ NV_FTR(MMFR2, VARange));
+
+ /* Force TTL support */
+ val |= FIELD_PREP(NV_FTR(MMFR2, TTL), 0b0001);
+ break;
+
+ case SYS_ID_AA64DFR0_EL1:
+ /* Only limited support for PMU, Debug, BPs and WPs */
+ val &= (NV_FTR(DFR0, PMUVer) |
+ NV_FTR(DFR0, WRPs) |
+ NV_FTR(DFR0, BRPs) |
+ NV_FTR(DFR0, DebugVer));
+
+ /* Cap Debug to ARMv8.1 */
+ tmp = FIELD_GET(NV_FTR(DFR0, DebugVer), val);
+ if (tmp > 0b0111) {
+ val &= ~NV_FTR(DFR0, DebugVer);
+ val |= FIELD_PREP(NV_FTR(DFR0, DebugVer), 0b0111);
+ }
+ break;
+
+ default:
+ /* Unknown register, just wipe it clean */
+ val = 0;
+ break;
+ }
+
+ p->regval = val;
+}
diff --git a/arch/arm64/kvm/pvtime.c b/arch/arm64/kvm/pvtime.c
index 78a09f7a6637..4ceabaa4c30b 100644
--- a/arch/arm64/kvm/pvtime.c
+++ b/arch/arm64/kvm/pvtime.c
@@ -19,7 +19,7 @@ void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
u64 steal = 0;
int idx;
- if (base == GPA_INVALID)
+ if (base == INVALID_GPA)
return;
idx = srcu_read_lock(&kvm->srcu);
@@ -40,7 +40,7 @@ long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
switch (feature) {
case ARM_SMCCC_HV_PV_TIME_FEATURES:
case ARM_SMCCC_HV_PV_TIME_ST:
- if (vcpu->arch.steal.base != GPA_INVALID)
+ if (vcpu->arch.steal.base != INVALID_GPA)
val = SMCCC_RET_SUCCESS;
break;
}
@@ -54,7 +54,7 @@ gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu->kvm;
u64 base = vcpu->arch.steal.base;
- if (base == GPA_INVALID)
+ if (base == INVALID_GPA)
return base;
/*
@@ -89,7 +89,7 @@ int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
return -EFAULT;
if (!IS_ALIGNED(ipa, 64))
return -EINVAL;
- if (vcpu->arch.steal.base != GPA_INVALID)
+ if (vcpu->arch.steal.base != INVALID_GPA)
return -EEXIST;
/* Check the address is in a valid memslot */
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e0267f672b8a..49a3257dec46 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -27,10 +27,11 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/virt.h>
/* Maximum phys_shift supported for any VM on this host */
-static u32 kvm_ipa_limit;
+static u32 __ro_after_init kvm_ipa_limit;
/*
* ARMv8 Reset Values
@@ -38,12 +39,15 @@ static u32 kvm_ipa_limit;
#define VCPU_RESET_PSTATE_EL1 (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
PSR_F_BIT | PSR_D_BIT)
+#define VCPU_RESET_PSTATE_EL2 (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \
+ PSR_F_BIT | PSR_D_BIT)
+
#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
PSR_AA32_I_BIT | PSR_AA32_F_BIT)
-unsigned int kvm_sve_max_vl;
+unsigned int __ro_after_init kvm_sve_max_vl;
-int kvm_arm_init_sve(void)
+int __init kvm_arm_init_sve(void)
{
if (system_supports_sve()) {
kvm_sve_max_vl = sve_max_virtualisable_vl();
@@ -157,6 +161,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
if (sve_state)
kvm_unshare_hyp(sve_state, sve_state + vcpu_sve_state_size(vcpu));
kfree(sve_state);
+ kfree(vcpu->arch.ccsidr);
}
static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
@@ -220,6 +225,10 @@ static int kvm_set_vm_width(struct kvm_vcpu *vcpu)
if (kvm_has_mte(kvm) && is32bit)
return -EINVAL;
+ /* NV is incompatible with AArch32 */
+ if (vcpu_has_nv(vcpu) && is32bit)
+ return -EINVAL;
+
if (is32bit)
set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags);
@@ -272,6 +281,12 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
if (loaded)
kvm_arch_vcpu_put(vcpu);
+ /* Disallow NV+SVE for the time being */
+ if (vcpu_has_nv(vcpu) && vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (!kvm_arm_vcpu_sve_finalized(vcpu)) {
if (test_bit(KVM_ARM_VCPU_SVE, vcpu->arch.features)) {
ret = kvm_vcpu_enable_sve(vcpu);
@@ -294,6 +309,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
default:
if (vcpu_el1_is_32bit(vcpu)) {
pstate = VCPU_RESET_PSTATE_SVC;
+ } else if (vcpu_has_nv(vcpu)) {
+ pstate = VCPU_RESET_PSTATE_EL2;
} else {
pstate = VCPU_RESET_PSTATE_EL1;
}
@@ -352,7 +369,7 @@ u32 get_kvm_ipa_limit(void)
return kvm_ipa_limit;
}
-int kvm_set_ipa_limit(void)
+int __init kvm_set_ipa_limit(void)
{
unsigned int parange;
u64 mmfr0;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c6cbfe6b854b..53749d3a0996 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -11,6 +11,7 @@
#include <linux/bitfield.h>
#include <linux/bsearch.h>
+#include <linux/cacheinfo.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/printk.h>
@@ -24,6 +25,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include <asm/kvm_nested.h>
#include <asm/perf_event.h>
#include <asm/sysreg.h>
@@ -78,28 +80,112 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
__vcpu_write_sys_reg_to_cpu(val, reg))
return;
- __vcpu_sys_reg(vcpu, reg) = val;
+ __vcpu_sys_reg(vcpu, reg) = val;
}
-/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */
-static u32 cache_levels;
-
/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */
#define CSSELR_MAX 14
+/*
+ * Returns the minimum line size for the selected cache, expressed as
+ * Log2(bytes).
+ */
+static u8 get_min_cache_line_size(bool icache)
+{
+ u64 ctr = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u8 field;
+
+ if (icache)
+ field = SYS_FIELD_GET(CTR_EL0, IminLine, ctr);
+ else
+ field = SYS_FIELD_GET(CTR_EL0, DminLine, ctr);
+
+ /*
+ * Cache line size is represented as Log2(words) in CTR_EL0.
+ * Log2(bytes) can be derived with the following:
+ *
+ * Log2(words) + 2 = Log2(bytes / 4) + 2
+ * = Log2(bytes) - 2 + 2
+ * = Log2(bytes)
+ */
+ return field + 2;
+}
+
/* Which cache CCSIDR represents depends on CSSELR value. */
-static u32 get_ccsidr(u32 csselr)
+static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr)
{
- u32 ccsidr;
+ u8 line_size;
- /* Make sure noone else changes CSSELR during this! */
- local_irq_disable();
- write_sysreg(csselr, csselr_el1);
- isb();
- ccsidr = read_sysreg(ccsidr_el1);
- local_irq_enable();
+ if (vcpu->arch.ccsidr)
+ return vcpu->arch.ccsidr[csselr];
- return ccsidr;
+ line_size = get_min_cache_line_size(csselr & CSSELR_EL1_InD);
+
+ /*
+ * Fabricate a CCSIDR value as the overriding value does not exist.
+ * The real CCSIDR value will not be used as it can vary by the
+ * physical CPU which the vcpu currently resides in.
+ *
+ * The line size is determined with get_min_cache_line_size(), which
+ * should be valid for all CPUs even if they have different cache
+ * configuration.
+ *
+ * The associativity bits are cleared, meaning the geometry of all data
+ * and unified caches (which are guaranteed to be PIPT and thus
+ * non-aliasing) are 1 set and 1 way.
+ * Guests should not be doing cache operations by set/way at all, and
+ * for this reason, we trap them and attempt to infer the intent, so
+ * that we can flush the entire guest's address space at the appropriate
+ * time. The exposed geometry minimizes the number of the traps.
+ * [If guests should attempt to infer aliasing properties from the
+ * geometry (which is not permitted by the architecture), they would
+ * only do so for virtually indexed caches.]
+ *
+ * We don't check if the cache level exists as it is allowed to return
+ * an UNKNOWN value if not.
+ */
+ return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, line_size - 4);
+}
+
+static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val)
+{
+ u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4;
+ u32 *ccsidr = vcpu->arch.ccsidr;
+ u32 i;
+
+ if ((val & CCSIDR_EL1_RES0) ||
+ line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD))
+ return -EINVAL;
+
+ if (!ccsidr) {
+ if (val == get_ccsidr(vcpu, csselr))
+ return 0;
+
+ ccsidr = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT);
+ if (!ccsidr)
+ return -ENOMEM;
+
+ for (i = 0; i < CSSELR_MAX; i++)
+ ccsidr[i] = get_ccsidr(vcpu, i);
+
+ vcpu->arch.ccsidr = ccsidr;
+ }
+
+ ccsidr[csselr] = val;
+
+ return 0;
+}
+
+static bool access_rw(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ vcpu_write_sys_reg(vcpu, p->regval, r->reg);
+ else
+ p->regval = vcpu_read_sys_reg(vcpu, r->reg);
+
+ return true;
}
/*
@@ -260,6 +346,14 @@ static bool trap_raz_wi(struct kvm_vcpu *vcpu,
return read_zero(vcpu, p);
}
+static bool trap_undef(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ kvm_inject_undefined(vcpu);
+ return false;
+}
+
/*
* ARMv8.1 mandates at least a trivial LORegion implementation, where all the
* RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
@@ -370,12 +464,9 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
- if (p->is_write) {
- vcpu_write_sys_reg(vcpu, p->regval, r->reg);
+ access_rw(vcpu, p, r);
+ if (p->is_write)
vcpu_set_flag(vcpu, DEBUG_DIRTY);
- } else {
- p->regval = vcpu_read_sys_reg(vcpu, r->reg);
- }
trace_trap_reg(__func__, r->reg, p->is_write, p->regval);
@@ -1049,7 +1140,9 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu,
treg = TIMER_REG_CVAL;
break;
default:
- BUG();
+ print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
+ kvm_inject_undefined(vcpu);
+ return false;
}
if (p->is_write)
@@ -1155,6 +1248,12 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r
val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon),
pmuver_to_perfmon(vcpu_pmuver(vcpu)));
break;
+ case SYS_ID_AA64MMFR2_EL1:
+ val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK;
+ break;
+ case SYS_ID_MMFR4_EL1:
+ val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
+ break;
}
return val;
@@ -1205,6 +1304,9 @@ static bool access_id_reg(struct kvm_vcpu *vcpu,
return write_to_read_only(vcpu, p, r);
p->regval = read_id_reg(vcpu, r);
+ if (vcpu_has_nv(vcpu))
+ access_nested_id_reg(vcpu, p, r);
+
return true;
}
@@ -1385,10 +1487,78 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
if (p->is_write)
return write_to_read_only(vcpu, p, r);
- p->regval = read_sysreg(clidr_el1);
+ p->regval = __vcpu_sys_reg(vcpu, r->reg);
return true;
}
+/*
+ * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary
+ * by the physical CPU which the vcpu currently resides in.
+ */
+static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+ u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u64 clidr;
+ u8 loc;
+
+ if ((ctr_el0 & CTR_EL0_IDC)) {
+ /*
+ * Data cache clean to the PoU is not required so LoUU and LoUIS
+ * will not be set and a unified cache, which will be marked as
+ * LoC, will be added.
+ *
+ * If not DIC, let the unified cache L2 so that an instruction
+ * cache can be added as L1 later.
+ */
+ loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2;
+ clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc);
+ } else {
+ /*
+ * Data cache clean to the PoU is required so let L1 have a data
+ * cache and mark it as LoUU and LoUIS. As L1 has a data cache,
+ * it can be marked as LoC too.
+ */
+ loc = 1;
+ clidr = 1 << CLIDR_LOUU_SHIFT;
+ clidr |= 1 << CLIDR_LOUIS_SHIFT;
+ clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1);
+ }
+
+ /*
+ * Instruction cache invalidation to the PoU is required so let L1 have
+ * an instruction cache. If L1 already has a data cache, it will be
+ * CACHE_TYPE_SEPARATE.
+ */
+ if (!(ctr_el0 & CTR_EL0_DIC))
+ clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1);
+
+ clidr |= loc << CLIDR_LOC_SHIFT;
+
+ /*
+ * Add tag cache unified to data cache. Allocation tags and data are
+ * unified in a cache line so that it looks valid even if there is only
+ * one cache line.
+ */
+ if (kvm_has_mte(vcpu->kvm))
+ clidr |= 2 << CLIDR_TTYPE_SHIFT(loc);
+
+ __vcpu_sys_reg(vcpu, r->reg) = clidr;
+}
+
+static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+ u64 val)
+{
+ u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
+ u64 idc = !CLIDR_LOC(val) || (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val));
+
+ if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc))
+ return -EINVAL;
+
+ __vcpu_sys_reg(vcpu, rd->reg) = val;
+
+ return 0;
+}
+
static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
const struct sys_reg_desc *r)
{
@@ -1410,22 +1580,10 @@ static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return write_to_read_only(vcpu, p, r);
csselr = vcpu_read_sys_reg(vcpu, CSSELR_EL1);
- p->regval = get_ccsidr(csselr);
+ csselr &= CSSELR_EL1_Level | CSSELR_EL1_InD;
+ if (csselr < CSSELR_MAX)
+ p->regval = get_ccsidr(vcpu, csselr);
- /*
- * Guests should not be doing cache operations by set/way at all, and
- * for this reason, we trap them and attempt to infer the intent, so
- * that we can flush the entire guest's address space at the appropriate
- * time.
- * To prevent this trapping from causing performance problems, let's
- * expose the geometry of all data and unified caches (which are
- * guaranteed to be PIPT and thus non-aliasing) as 1 set and 1 way.
- * [If guests should attempt to infer aliasing properties from the
- * geometry (which is not permitted by the architecture), they would
- * only do so for virtually indexed caches.]
- */
- if (!(csselr & 1)) // data or unified cache
- p->regval &= ~GENMASK(27, 3);
return true;
}
@@ -1446,6 +1604,44 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
.visibility = mte_visibility, \
}
+static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ if (vcpu_has_nv(vcpu))
+ return 0;
+
+ return REG_HIDDEN;
+}
+
+#define EL2_REG(name, acc, rst, v) { \
+ SYS_DESC(SYS_##name), \
+ .access = acc, \
+ .reset = rst, \
+ .reg = name, \
+ .visibility = el2_visibility, \
+ .val = v, \
+}
+
+/*
+ * EL{0,1}2 registers are the EL2 view on an EL0 or EL1 register when
+ * HCR_EL2.E2H==1, and only in the sysreg table for convenience of
+ * handling traps. Given that, they are always hidden from userspace.
+ */
+static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *rd)
+{
+ return REG_HIDDEN_USER;
+}
+
+#define EL12_REG(name, acc, rst, v) { \
+ SYS_DESC(SYS_##name##_EL12), \
+ .access = acc, \
+ .reset = rst, \
+ .reg = name##_EL1, \
+ .val = v, \
+ .visibility = elx2_visibility, \
+}
+
/* sys_reg_desc initialiser for known cpufeature ID registers */
#define ID_SANITISED(name) { \
SYS_DESC(SYS_##name), \
@@ -1490,6 +1686,42 @@ static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
.visibility = raz_visibility, \
}
+static bool access_sp_el1(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_sys_reg(vcpu, SP_EL1) = p->regval;
+ else
+ p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
+
+ return true;
+}
+
+static bool access_elr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1);
+ else
+ p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1);
+
+ return true;
+}
+
+static bool access_spsr(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (p->is_write)
+ __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval;
+ else
+ p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1);
+
+ return true;
+}
+
/*
* Architected system registers.
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -1646,6 +1878,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
PTRAUTH_KEY(APDB),
PTRAUTH_KEY(APGA),
+ { SYS_DESC(SYS_SPSR_EL1), access_spsr},
+ { SYS_DESC(SYS_ELR_EL1), access_elr},
+
{ SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
{ SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
{ SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },
@@ -1693,7 +1928,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_LORC_EL1), trap_loregion },
{ SYS_DESC(SYS_LORID_EL1), trap_loregion },
- { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 },
+ { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
{ SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },
{ SYS_DESC(SYS_ICC_IAR0_EL1), write_to_read_only },
@@ -1717,7 +1952,9 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0},
{ SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
- { SYS_DESC(SYS_CLIDR_EL1), access_clidr },
+ { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1,
+ .set_user = set_clidr },
+ { SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
{ SYS_DESC(SYS_SMIDR_EL1), undef_access },
{ SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
{ SYS_DESC(SYS_CTR_EL0), access_ctr },
@@ -1913,9 +2150,67 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ PMU_SYS_REG(SYS_PMCCFILTR_EL0), .access = access_pmu_evtyper,
.reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 },
+ EL2_REG(VPIDR_EL2, access_rw, reset_unknown, 0),
+ EL2_REG(VMPIDR_EL2, access_rw, reset_unknown, 0),
+ EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
+ EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
+ EL2_REG(HCR_EL2, access_rw, reset_val, 0),
+ EL2_REG(MDCR_EL2, access_rw, reset_val, 0),
+ EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ),
+ EL2_REG(HSTR_EL2, access_rw, reset_val, 0),
+ EL2_REG(HACR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
+ EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
+ EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
+ EL2_REG(VTTBR_EL2, access_rw, reset_val, 0),
+ EL2_REG(VTCR_EL2, access_rw, reset_val, 0),
+
{ SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 },
+ EL2_REG(SPSR_EL2, access_rw, reset_val, 0),
+ EL2_REG(ELR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_SP_EL1), access_sp_el1},
+
{ SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 },
+ EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
+ EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
+ EL2_REG(ESR_EL2, access_rw, reset_val, 0),
{ SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x700 },
+
+ EL2_REG(FAR_EL2, access_rw, reset_val, 0),
+ EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
+ EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
+ EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
+ { SYS_DESC(SYS_RMR_EL2), trap_undef },
+
+ EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
+ EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),
+
+ EL2_REG(CNTVOFF_EL2, access_rw, reset_val, 0),
+ EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
+
+ EL12_REG(SCTLR, access_vm_reg, reset_val, 0x00C50078),
+ EL12_REG(CPACR, access_rw, reset_val, 0),
+ EL12_REG(TTBR0, access_vm_reg, reset_unknown, 0),
+ EL12_REG(TTBR1, access_vm_reg, reset_unknown, 0),
+ EL12_REG(TCR, access_vm_reg, reset_val, 0),
+ { SYS_DESC(SYS_SPSR_EL12), access_spsr},
+ { SYS_DESC(SYS_ELR_EL12), access_elr},
+ EL12_REG(AFSR0, access_vm_reg, reset_unknown, 0),
+ EL12_REG(AFSR1, access_vm_reg, reset_unknown, 0),
+ EL12_REG(ESR, access_vm_reg, reset_unknown, 0),
+ EL12_REG(FAR, access_vm_reg, reset_unknown, 0),
+ EL12_REG(MAIR, access_vm_reg, reset_unknown, 0),
+ EL12_REG(AMAIR, access_vm_reg, reset_amair_el1, 0),
+ EL12_REG(VBAR, access_rw, reset_val, 0),
+ EL12_REG(CONTEXTIDR, access_vm_reg, reset_val, 0),
+ EL12_REG(CNTKCTL, access_rw, reset_val, 0),
+
+ EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
@@ -2219,6 +2514,10 @@ static const struct sys_reg_desc cp15_regs[] = {
{ Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
{ Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },
+
+ /* CCSIDR2 */
+ { Op1(1), CRn( 0), CRm( 0), Op2(2), undef_access },
+
{ Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 },
};
@@ -2724,7 +3023,6 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
FUNCTION_INVARIANT(midr_el1)
FUNCTION_INVARIANT(revidr_el1)
-FUNCTION_INVARIANT(clidr_el1)
FUNCTION_INVARIANT(aidr_el1)
static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
@@ -2733,10 +3031,9 @@ static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r)
}
/* ->val is filled in by kvm_sys_reg_table_init() */
-static struct sys_reg_desc invariant_sys_regs[] = {
+static struct sys_reg_desc invariant_sys_regs[] __ro_after_init = {
{ SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 },
{ SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 },
- { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 },
{ SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 },
{ SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 },
};
@@ -2773,33 +3070,7 @@ static int set_invariant_sys_reg(u64 id, u64 __user *uaddr)
return 0;
}
-static bool is_valid_cache(u32 val)
-{
- u32 level, ctype;
-
- if (val >= CSSELR_MAX)
- return false;
-
- /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */
- level = (val >> 1);
- ctype = (cache_levels >> (level * 3)) & 7;
-
- switch (ctype) {
- case 0: /* No cache */
- return false;
- case 1: /* Instruction cache only */
- return (val & 1);
- case 2: /* Data cache only */
- case 4: /* Unified cache */
- return !(val & 1);
- case 3: /* Separate instruction and data caches */
- return true;
- default: /* Reserved: we can't know instruction or data. */
- return false;
- }
-}
-
-static int demux_c15_get(u64 id, void __user *uaddr)
+static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val;
u32 __user *uval = uaddr;
@@ -2815,16 +3086,16 @@ static int demux_c15_get(u64 id, void __user *uaddr)
return -ENOENT;
val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
- if (!is_valid_cache(val))
+ if (val >= CSSELR_MAX)
return -ENOENT;
- return put_user(get_ccsidr(val), uval);
+ return put_user(get_ccsidr(vcpu, val), uval);
default:
return -ENOENT;
}
}
-static int demux_c15_set(u64 id, void __user *uaddr)
+static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
u32 val, newval;
u32 __user *uval = uaddr;
@@ -2840,16 +3111,13 @@ static int demux_c15_set(u64 id, void __user *uaddr)
return -ENOENT;
val = (id & KVM_REG_ARM_DEMUX_VAL_MASK)
>> KVM_REG_ARM_DEMUX_VAL_SHIFT;
- if (!is_valid_cache(val))
+ if (val >= CSSELR_MAX)
return -ENOENT;
if (get_user(newval, uval))
return -EFAULT;
- /* This is also invariant: you can't change it. */
- if (newval != get_ccsidr(val))
- return -EINVAL;
- return 0;
+ return set_ccsidr(vcpu, val, newval);
default:
return -ENOENT;
}
@@ -2864,7 +3132,7 @@ int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
int ret;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r)
+ if (!r || sysreg_hidden_user(vcpu, r))
return -ENOENT;
if (r->get_user) {
@@ -2886,7 +3154,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
- return demux_c15_get(reg->id, uaddr);
+ return demux_c15_get(vcpu, reg->id, uaddr);
err = get_invariant_sys_reg(reg->id, uaddr);
if (err != -ENOENT)
@@ -2908,7 +3176,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
return -EFAULT;
r = id_to_sys_reg_desc(vcpu, reg->id, table, num);
- if (!r)
+ if (!r || sysreg_hidden_user(vcpu, r))
return -ENOENT;
if (sysreg_user_write_ignore(vcpu, r))
@@ -2930,7 +3198,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
int err;
if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX)
- return demux_c15_set(reg->id, uaddr);
+ return demux_c15_set(vcpu, reg->id, uaddr);
err = set_invariant_sys_reg(reg->id, uaddr);
if (err != -ENOENT)
@@ -2942,13 +3210,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
static unsigned int num_demux_regs(void)
{
- unsigned int i, count = 0;
-
- for (i = 0; i < CSSELR_MAX; i++)
- if (is_valid_cache(i))
- count++;
-
- return count;
+ return CSSELR_MAX;
}
static int write_demux_regids(u64 __user *uindices)
@@ -2958,8 +3220,6 @@ static int write_demux_regids(u64 __user *uindices)
val |= KVM_REG_ARM_DEMUX_ID_CCSIDR;
for (i = 0; i < CSSELR_MAX; i++) {
- if (!is_valid_cache(i))
- continue;
if (put_user(val | i, uindices))
return -EFAULT;
uindices++;
@@ -3002,7 +3262,7 @@ static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
if (!(rd->reg || rd->get_user))
return 0;
- if (sysreg_hidden(vcpu, rd))
+ if (sysreg_hidden_user(vcpu, rd))
return 0;
if (!copy_reg_to_user(rd, uind))
@@ -3057,11 +3317,10 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
return write_demux_regids(uindices);
}
-int kvm_sys_reg_table_init(void)
+int __init kvm_sys_reg_table_init(void)
{
bool valid = true;
unsigned int i;
- struct sys_reg_desc clidr;
/* Make sure tables are unique and in order. */
valid &= check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false);
@@ -3078,23 +3337,5 @@ int kvm_sys_reg_table_init(void)
for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++)
invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]);
- /*
- * CLIDR format is awkward, so clean it up. See ARM B4.1.20:
- *
- * If software reads the Cache Type fields from Ctype1
- * upwards, once it has seen a value of 0b000, no caches
- * exist at further-out levels of the hierarchy. So, for
- * example, if Ctype3 is the first Cache Type field with a
- * value of 0b000, the values of Ctype4 to Ctype7 must be
- * ignored.
- */
- get_clidr_el1(NULL, &clidr); /* Ugly... */
- cache_levels = clidr.val;
- for (i = 0; i < 7; i++)
- if (((cache_levels >> (i*3)) & 7) == 0)
- break;
- /* Clear all higher bits. */
- cache_levels &= (1 << (i*3))-1;
-
return 0;
}
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index e4ebb3a379fd..6b11f2cc7146 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -85,8 +85,9 @@ struct sys_reg_desc {
};
#define REG_HIDDEN (1 << 0) /* hidden from userspace and guest */
-#define REG_RAZ (1 << 1) /* RAZ from userspace and guest */
-#define REG_USER_WI (1 << 2) /* WI from userspace only */
+#define REG_HIDDEN_USER (1 << 1) /* hidden from userspace only */
+#define REG_RAZ (1 << 2) /* RAZ from userspace and guest */
+#define REG_USER_WI (1 << 3) /* WI from userspace only */
static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p,
@@ -152,6 +153,15 @@ static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu,
return sysreg_visibility(vcpu, r) & REG_HIDDEN;
}
+static inline bool sysreg_hidden_user(const struct kvm_vcpu *vcpu,
+ const struct sys_reg_desc *r)
+{
+ if (likely(!r->visibility))
+ return false;
+
+ return r->visibility(vcpu, r) & (REG_HIDDEN | REG_HIDDEN_USER);
+}
+
static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *r)
{
diff --git a/arch/arm64/kvm/trace_arm.h b/arch/arm64/kvm/trace_arm.h
index 33e4e7dd2719..f3e46a976125 100644
--- a/arch/arm64/kvm/trace_arm.h
+++ b/arch/arm64/kvm/trace_arm.h
@@ -2,6 +2,7 @@
#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ARM_ARM64_KVM_H
+#include <asm/kvm_emulate.h>
#include <kvm/arm_arch_timer.h>
#include <linux/tracepoint.h>
@@ -301,6 +302,64 @@ TRACE_EVENT(kvm_timer_emulate,
__entry->timer_idx, __entry->should_fire)
);
+TRACE_EVENT(kvm_nested_eret,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2,
+ unsigned long spsr_el2),
+ TP_ARGS(vcpu, elr_el2, spsr_el2),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(unsigned long, elr_el2)
+ __field(unsigned long, spsr_el2)
+ __field(unsigned long, target_mode)
+ __field(unsigned long, hcr_el2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->elr_el2 = elr_el2;
+ __entry->spsr_el2 = spsr_el2;
+ __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT);
+ __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
+ ),
+
+ TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
+ __entry->elr_el2, __entry->spsr_el2,
+ __print_symbolic(__entry->target_mode, kvm_mode_names),
+ __entry->hcr_el2)
+);
+
+TRACE_EVENT(kvm_inject_nested_exception,
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type),
+ TP_ARGS(vcpu, esr_el2, type),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(unsigned long, esr_el2)
+ __field(int, type)
+ __field(unsigned long, spsr_el2)
+ __field(unsigned long, pc)
+ __field(unsigned long, source_mode)
+ __field(unsigned long, hcr_el2)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->esr_el2 = esr_el2;
+ __entry->type = type;
+ __entry->spsr_el2 = *vcpu_cpsr(vcpu);
+ __entry->pc = *vcpu_pc(vcpu);
+ __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
+ __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
+ ),
+
+ TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
+ __print_symbolic(__entry->type, kvm_exception_type_names),
+ __entry->esr_el2, __entry->pc, __entry->spsr_el2,
+ __print_symbolic(__entry->source_mode, kvm_mode_names),
+ __entry->hcr_el2)
+);
+
#endif /* _TRACE_ARM_ARM64_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index f6d4f4052555..cd134db41a57 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -465,17 +465,15 @@ out:
/* GENERIC PROBE */
-static int vgic_init_cpu_starting(unsigned int cpu)
+void kvm_vgic_cpu_up(void)
{
enable_percpu_irq(kvm_vgic_global_state.maint_irq, 0);
- return 0;
}
-static int vgic_init_cpu_dying(unsigned int cpu)
+void kvm_vgic_cpu_down(void)
{
disable_percpu_irq(kvm_vgic_global_state.maint_irq);
- return 0;
}
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -572,7 +570,7 @@ int kvm_vgic_hyp_init(void)
if (ret)
return ret;
- if (!has_mask)
+ if (!has_mask && !kvm_vgic_global_state.maint_irq)
return 0;
ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
@@ -584,19 +582,6 @@ int kvm_vgic_hyp_init(void)
return ret;
}
- ret = cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING,
- "kvm/arm/vgic:starting",
- vgic_init_cpu_starting, vgic_init_cpu_dying);
- if (ret) {
- kvm_err("Cannot register vgic CPU notifier\n");
- goto out_free_irq;
- }
-
kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
return 0;
-
-out_free_irq:
- free_percpu_irq(kvm_vgic_global_state.maint_irq,
- kvm_get_running_vcpus());
- return ret;
}
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c
index b32d434c1d4a..e67b3b2c8044 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio.c
@@ -473,9 +473,10 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
* active state can be overwritten when the VCPU's state is synced coming back
* from the guest.
*
- * For shared interrupts as well as GICv3 private interrupts, we have to
- * stop all the VCPUs because interrupts can be migrated while we don't hold
- * the IRQ locks and we don't want to be chasing moving targets.
+ * For shared interrupts as well as GICv3 private interrupts accessed from the
+ * non-owning CPU, we have to stop all the VCPUs because interrupts can be
+ * migrated while we don't hold the IRQ locks and we don't want to be chasing
+ * moving targets.
*
* For GICv2 private interrupts we don't have to do anything because
* userspace accesses to the VGIC state already require all VCPUs to be
@@ -484,7 +485,8 @@ int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
*/
static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
+ vcpu != kvm_get_running_vcpu()) ||
intid >= VGIC_NR_PRIVATE_IRQS)
kvm_arm_halt_guest(vcpu->kvm);
}
@@ -492,7 +494,8 @@ static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
/* See vgic_access_active_prepare */
static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ if ((vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
+ vcpu != kvm_get_running_vcpu()) ||
intid >= VGIC_NR_PRIVATE_IRQS)
kvm_arm_resume_guest(vcpu->kvm);
}
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 684bdfaad4a9..469d816f356f 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -3,6 +3,7 @@
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
+#include <linux/kstrtox.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
@@ -584,25 +585,25 @@ DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);
static int __init early_group0_trap_cfg(char *buf)
{
- return strtobool(buf, &group0_trap);
+ return kstrtobool(buf, &group0_trap);
}
early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);
static int __init early_group1_trap_cfg(char *buf)
{
- return strtobool(buf, &group1_trap);
+ return kstrtobool(buf, &group1_trap);
}
early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);
static int __init early_common_trap_cfg(char *buf)
{
- return strtobool(buf, &common_trap);
+ return kstrtobool(buf, &common_trap);
}
early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);
static int __init early_gicv4_enable(char *buf)
{
- return strtobool(buf, &gicv4_enable);
+ return kstrtobool(buf, &gicv4_enable);
}
early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c
index d78ae63d7c15..08978d0672e7 100644
--- a/arch/arm64/kvm/vmid.c
+++ b/arch/arm64/kvm/vmid.c
@@ -16,7 +16,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>
-unsigned int kvm_arm_vmid_bits;
+unsigned int __ro_after_init kvm_arm_vmid_bits;
static DEFINE_RAW_SPINLOCK(cpu_vmid_lock);
static atomic64_t vmid_generation;
@@ -172,7 +172,7 @@ void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
/*
* Initialize the VMID allocator
*/
-int kvm_arm_vmid_alloc_init(void)
+int __init kvm_arm_vmid_alloc_init(void)
{
kvm_arm_vmid_bits = kvm_get_vmid_bits();
@@ -190,7 +190,7 @@ int kvm_arm_vmid_alloc_init(void)
return 0;
}
-void kvm_arm_vmid_alloc_free(void)
+void __init kvm_arm_vmid_alloc_free(void)
{
kfree(vmid_map);
}
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 10dcfa13390a..37b1340e9646 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -33,6 +33,7 @@ HAS_GIC_PRIO_MASKING
HAS_GIC_PRIO_RELAXED_SYNC
HAS_LDAPR
HAS_LSE_ATOMICS
+HAS_NESTED_VIRT
HAS_NO_FPSIMD
HAS_NO_HW_PREFETCH
HAS_PAN
diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk
index 7f27d66a17e1..6fa0468caa00 100755
--- a/arch/arm64/tools/gen-sysreg.awk
+++ b/arch/arm64/tools/gen-sysreg.awk
@@ -103,6 +103,7 @@ END {
res0 = "UL(0)"
res1 = "UL(0)"
+ unkn = "UL(0)"
next_bit = 63
@@ -117,11 +118,13 @@ END {
define(reg "_RES0", "(" res0 ")")
define(reg "_RES1", "(" res1 ")")
+ define(reg "_UNKN", "(" unkn ")")
print ""
reg = null
res0 = null
res1 = null
+ unkn = null
next
}
@@ -139,6 +142,7 @@ END {
res0 = "UL(0)"
res1 = "UL(0)"
+ unkn = "UL(0)"
define("REG_" reg, "S" op0 "_" op1 "_C" crn "_C" crm "_" op2)
define("SYS_" reg, "sys_reg(" op0 ", " op1 ", " crn ", " crm ", " op2 ")")
@@ -166,7 +170,9 @@ END {
define(reg "_RES0", "(" res0 ")")
if (res1 != null)
define(reg "_RES1", "(" res1 ")")
- if (res0 != null || res1 != null)
+ if (unkn != null)
+ define(reg "_UNKN", "(" unkn ")")
+ if (res0 != null || res1 != null || unkn != null)
print ""
reg = null
@@ -177,6 +183,7 @@ END {
op2 = null
res0 = null
res1 = null
+ unkn = null
next
}
@@ -195,6 +202,7 @@ END {
next_bit = 0
res0 = null
res1 = null
+ unkn = null
next
}
@@ -220,6 +228,16 @@ END {
next
}
+/^Unkn/ && (block == "Sysreg" || block == "SysregFields") {
+ expect_fields(2)
+ parse_bitdef(reg, "UNKN", $2)
+ field = "UNKN_" msb "_" lsb
+
+ unkn = unkn " | GENMASK_ULL(" msb ", " lsb ")"
+
+ next
+}
+
/^Field/ && (block == "Sysreg" || block == "SysregFields") {
expect_fields(3)
field = $3
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index 94d78acafb67..dd5a9c7e310f 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -15,6 +15,8 @@
# Res1 <msb>[:<lsb>]
+# Unkn <msb>[:<lsb>]
+
# Field <msb>[:<lsb>] <name>
# Enum <msb>[:<lsb>] <name>
@@ -1779,6 +1781,16 @@ Sysreg SCXTNUM_EL1 3 0 13 0 7
Field 63:0 SoftwareContextNumber
EndSysreg
+# The bit layout for CCSIDR_EL1 depends on whether FEAT_CCIDX is implemented.
+# The following is for case when FEAT_CCIDX is not implemented.
+Sysreg CCSIDR_EL1 3 1 0 0 0
+Res0 63:32
+Unkn 31:28
+Field 27:13 NumSets
+Field 12:3 Associativity
+Field 2:0 LineSize
+EndSysreg
+
Sysreg CLIDR_EL1 3 1 0 0 1
Res0 63:47
Field 46:33 Ttypen
@@ -1795,6 +1807,11 @@ Field 5:3 Ctype2
Field 2:0 Ctype1
EndSysreg
+Sysreg CCSIDR2_EL1 3 1 0 0 2
+Res0 63:24
+Field 23:0 NumSets
+EndSysreg
+
Sysreg GMID_EL1 3 1 0 0 4
Res0 63:4
Field 3:0 BS
diff --git a/arch/csky/lib/delay.c b/arch/csky/lib/delay.c
index 22570b0790d6..f5db317313bb 100644
--- a/arch/csky/lib/delay.c
+++ b/arch/csky/lib/delay.c
@@ -5,7 +5,7 @@
#include <linux/init.h>
#include <linux/delay.h>
-void __delay(unsigned long loops)
+void __aligned(8) __delay(unsigned long loops)
{
asm volatile (
"mov r0, r0\n"
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index ae9ff07de4ab..d7e1cabee2ec 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -43,4 +43,4 @@ obj-$(CONFIG_ELF_CORE) += elfcore.o
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
# The gate DSO image is built using a special linker script.
-include $(src)/Makefile.gate
+include $(srctree)/$(src)/Makefile.gate
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 9cc8b84f7eb0..7fd51257e0ed 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -94,15 +94,21 @@ config LOONGARCH
select HAVE_DYNAMIC_FTRACE_WITH_ARGS
select HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_EBPF_JIT
+ select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
select HAVE_EXIT_THREAD
select HAVE_FAST_GUP
select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GENERIC_VDSO
+ select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IOREMAP_PROT
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_IRQ_TIME_ACCOUNTING
+ select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
+ select HAVE_KRETPROBES
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_NMI
select HAVE_PCI
@@ -441,6 +447,24 @@ config ARCH_IOREMAP
protection support. However, you can enable LoongArch DMW-based
ioremap() for better performance.
+config ARCH_STRICT_ALIGN
+ bool "Enable -mstrict-align to prevent unaligned accesses" if EXPERT
+ default y
+ help
+ Not all LoongArch cores support h/w unaligned access, we can use
+ -mstrict-align build parameter to prevent unaligned accesses.
+
+ CPUs with h/w unaligned access support:
+ Loongson-2K2000/2K3000/3A5000/3C5000/3D5000.
+
+ CPUs without h/w unaligned access support:
+ Loongson-2K500/2K1000.
+
+ This option is enabled by default to make the kernel be able to run
+ on all LoongArch systems. But you can disable it manually if you want
+ to run kernel only on systems with h/w unaligned access support in
+ order to optimise for performance.
+
config KEXEC
bool "Kexec system call"
select KEXEC_CORE
@@ -454,6 +478,7 @@ config KEXEC
config CRASH_DUMP
bool "Build kdump crash kernel"
+ select RELOCATABLE
help
Generate crash dump after being started by kexec. This should
be normally only set in special crash dump kernels which are
@@ -463,16 +488,38 @@ config CRASH_DUMP
For more details see Documentation/admin-guide/kdump/kdump.rst
-config PHYSICAL_START
- hex "Physical address where the kernel is loaded"
- default "0x90000000a0000000"
- depends on CRASH_DUMP
+config RELOCATABLE
+ bool "Relocatable kernel"
help
- This gives the XKPRANGE address where the kernel is loaded.
- If you plan to use kernel for capturing the crash dump change
- this value to start of the reserved region (the "X" value as
- specified in the "crashkernel=YM@XM" command line boot parameter
- passed to the panic-ed kernel).
+ This builds the kernel as a Position Independent Executable (PIE),
+ which retains all relocation metadata required, so as to relocate
+ the kernel binary at runtime to a different virtual address from
+ its link address.
+
+config RANDOMIZE_BASE
+ bool "Randomize the address of the kernel (KASLR)"
+ depends on RELOCATABLE
+ help
+ Randomizes the physical and virtual address at which the
+ kernel image is loaded, as a security feature that
+ deters exploit attempts relying on knowledge of the location
+ of kernel internals.
+
+ The kernel will be offset by up to RANDOMIZE_BASE_MAX_OFFSET.
+
+ If unsure, say N.
+
+config RANDOMIZE_BASE_MAX_OFFSET
+ hex "Maximum KASLR offset" if EXPERT
+ depends on RANDOMIZE_BASE
+ range 0x0 0x10000000
+ default "0x01000000"
+ help
+ When KASLR is active, this provides the maximum offset that will
+ be applied to the kernel image. It should be set according to the
+ amount of physical RAM available in the target system.
+
+ This is limited by the size of the lower address memory, 256MB.
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 4402387d2755..f71edf574101 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -71,14 +71,15 @@ KBUILD_AFLAGS_MODULE += -Wa,-mla-global-with-abs
KBUILD_CFLAGS_MODULE += -fplt -Wa,-mla-global-with-abs,-mla-local-with-abs
endif
+ifeq ($(CONFIG_RELOCATABLE),y)
+KBUILD_CFLAGS_KERNEL += -fPIE
+LDFLAGS_vmlinux += -static -pie --no-dynamic-linker -z notext
+endif
+
cflags-y += -ffreestanding
cflags-y += $(call cc-option, -mno-check-zero-division)
-ifndef CONFIG_PHYSICAL_START
load-y = 0x9000000000200000
-else
-load-y = $(CONFIG_PHYSICAL_START)
-endif
bootvars-y = VMLINUX_LOAD_ADDRESS=$(load-y)
drivers-$(CONFIG_PCI) += arch/loongarch/pci/
@@ -91,10 +92,15 @@ KBUILD_CPPFLAGS += -DVMLINUX_LOAD_ADDRESS=$(load-y)
# instead of .eh_frame so we don't discard them.
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
+ifdef CONFIG_ARCH_STRICT_ALIGN
# Don't emit unaligned accesses.
# Not all LoongArch cores support unaligned access, and as kernel we can't
# rely on others to provide emulation for these accesses.
KBUILD_CFLAGS += $(call cc-option,-mstrict-align)
+else
+# Optimise for performance on hardware supports unaligned access.
+KBUILD_CFLAGS += $(call cc-option,-mno-strict-align)
+endif
KBUILD_CFLAGS += -isystem $(shell $(CC) -print-file-name=include)
diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig
index eb84cae642e5..e18213f01cc4 100644
--- a/arch/loongarch/configs/loongson3_defconfig
+++ b/arch/loongarch/configs/loongson3_defconfig
@@ -48,6 +48,7 @@ CONFIG_HOTPLUG_CPU=y
CONFIG_NR_CPUS=64
CONFIG_NUMA=y
CONFIG_KEXEC=y
+CONFIG_CRASH_DUMP=y
CONFIG_SUSPEND=y
CONFIG_HIBERNATION=y
CONFIG_ACPI=y
diff --git a/arch/loongarch/include/asm/addrspace.h b/arch/loongarch/include/asm/addrspace.h
index d342935e5a72..8fb699b4d40a 100644
--- a/arch/loongarch/include/asm/addrspace.h
+++ b/arch/loongarch/include/asm/addrspace.h
@@ -125,4 +125,6 @@ extern unsigned long vm_map_base;
#define ISA_IOSIZE SZ_16K
#define IO_SPACE_LIMIT (PCI_IOSIZE - 1)
+#define PHYS_LINK_KADDR PHYSADDR(VMLINUX_LOAD_ADDRESS)
+
#endif /* _ASM_ADDRSPACE_H */
diff --git a/arch/loongarch/include/asm/asm.h b/arch/loongarch/include/asm/asm.h
index 40eea6aa469e..f591b3245def 100644
--- a/arch/loongarch/include/asm/asm.h
+++ b/arch/loongarch/include/asm/asm.h
@@ -188,4 +188,14 @@
#define PTRLOG 3
#endif
+/* Annotate a function as being unsuitable for kprobes. */
+#ifdef CONFIG_KPROBES
+#define _ASM_NOKPROBE(name) \
+ .pushsection "_kprobe_blacklist", "aw"; \
+ .quad name; \
+ .popsection
+#else
+#define _ASM_NOKPROBE(name)
+#endif
+
#endif /* __ASM_ASM_H */
diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
index be037a40580d..c51a1b43acb4 100644
--- a/arch/loongarch/include/asm/asmmacro.h
+++ b/arch/loongarch/include/asm/asmmacro.h
@@ -274,4 +274,21 @@
nor \dst, \src, zero
.endm
+.macro la_abs reg, sym
+#ifndef CONFIG_RELOCATABLE
+ la.abs \reg, \sym
+#else
+ 766:
+ lu12i.w \reg, 0
+ ori \reg, \reg, 0
+ lu32i.d \reg, 0
+ lu52i.d \reg, \reg, 0
+ .pushsection ".la_abs", "aw", %progbits
+ 768:
+ .dword 768b-766b
+ .dword \sym
+ .popsection
+#endif
+.endm
+
#endif /* _ASM_ASMMACRO_H */
diff --git a/arch/loongarch/include/asm/cpu.h b/arch/loongarch/include/asm/cpu.h
index 754f28506791..c3da91759472 100644
--- a/arch/loongarch/include/asm/cpu.h
+++ b/arch/loongarch/include/asm/cpu.h
@@ -36,7 +36,7 @@
#define PRID_SERIES_LA132 0x8000 /* Loongson 32bit */
#define PRID_SERIES_LA264 0xa000 /* Loongson 64bit, 2-issue */
-#define PRID_SERIES_LA364 0xb000 /* Loongson 64bit,3-issue */
+#define PRID_SERIES_LA364 0xb000 /* Loongson 64bit, 3-issue */
#define PRID_SERIES_LA464 0xc000 /* Loongson 64bit, 4-issue */
#define PRID_SERIES_LA664 0xd000 /* Loongson 64bit, 6-issue */
diff --git a/arch/loongarch/include/asm/hw_breakpoint.h b/arch/loongarch/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..21447fb1efc7
--- /dev/null
+++ b/arch/loongarch/include/asm/hw_breakpoint.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022-2023 Loongson Technology Corporation Limited
+ */
+#ifndef __ASM_HW_BREAKPOINT_H
+#define __ASM_HW_BREAKPOINT_H
+
+#include <asm/loongarch.h>
+
+#ifdef __KERNEL__
+
+/* Breakpoint */
+#define LOONGARCH_BREAKPOINT_EXECUTE (0 << 0)
+
+/* Watchpoints */
+#define LOONGARCH_BREAKPOINT_LOAD (1 << 0)
+#define LOONGARCH_BREAKPOINT_STORE (1 << 1)
+
+struct arch_hw_breakpoint_ctrl {
+ u32 __reserved : 28,
+ len : 2,
+ type : 2;
+};
+
+struct arch_hw_breakpoint {
+ u64 address;
+ u64 mask;
+ struct arch_hw_breakpoint_ctrl ctrl;
+};
+
+/* Lengths */
+#define LOONGARCH_BREAKPOINT_LEN_1 0b11
+#define LOONGARCH_BREAKPOINT_LEN_2 0b10
+#define LOONGARCH_BREAKPOINT_LEN_4 0b01
+#define LOONGARCH_BREAKPOINT_LEN_8 0b00
+
+/*
+ * Limits.
+ * Changing these will require modifications to the register accessors.
+ */
+#define LOONGARCH_MAX_BRP 8
+#define LOONGARCH_MAX_WRP 8
+
+/* Virtual debug register bases. */
+#define CSR_CFG_ADDR 0
+#define CSR_CFG_MASK (CSR_CFG_ADDR + LOONGARCH_MAX_BRP)
+#define CSR_CFG_CTRL (CSR_CFG_MASK + LOONGARCH_MAX_BRP)
+#define CSR_CFG_ASID (CSR_CFG_CTRL + LOONGARCH_MAX_WRP)
+
+/* Debug register names. */
+#define LOONGARCH_CSR_NAME_ADDR ADDR
+#define LOONGARCH_CSR_NAME_MASK MASK
+#define LOONGARCH_CSR_NAME_CTRL CTRL
+#define LOONGARCH_CSR_NAME_ASID ASID
+
+/* Accessor macros for the debug registers. */
+#define LOONGARCH_CSR_WATCH_READ(N, REG, T, VAL) \
+do { \
+ if (T == 0) \
+ VAL = csr_read64(LOONGARCH_CSR_##IB##N##REG); \
+ else \
+ VAL = csr_read64(LOONGARCH_CSR_##DB##N##REG); \
+} while (0)
+
+#define LOONGARCH_CSR_WATCH_WRITE(N, REG, T, VAL) \
+do { \
+ if (T == 0) \
+ csr_write64(VAL, LOONGARCH_CSR_##IB##N##REG); \
+ else \
+ csr_write64(VAL, LOONGARCH_CSR_##DB##N##REG); \
+} while (0)
+
+/* Exact number */
+#define CSR_FWPC_NUM 0x3f
+#define CSR_MWPC_NUM 0x3f
+
+#define CTRL_PLV_ENABLE 0x1e
+
+#define MWPnCFG3_LoadEn 8
+#define MWPnCFG3_StoreEn 9
+
+#define MWPnCFG3_Type_mask 0x3
+#define MWPnCFG3_Size_mask 0x3
+
+static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl)
+{
+ return (ctrl.len << 10) | (ctrl.type << 8);
+}
+
+static inline void decode_ctrl_reg(u32 reg, struct arch_hw_breakpoint_ctrl *ctrl)
+{
+ reg >>= 8;
+ ctrl->type = reg & MWPnCFG3_Type_mask;
+ reg >>= 2;
+ ctrl->len = reg & MWPnCFG3_Size_mask;
+}
+
+struct task_struct;
+struct notifier_block;
+struct perf_event;
+struct perf_event_attr;
+
+extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
+ int *gen_len, int *gen_type, int *offset);
+extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw);
+extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data);
+
+extern int arch_install_hw_breakpoint(struct perf_event *bp);
+extern void arch_uninstall_hw_breakpoint(struct perf_event *bp);
+extern int hw_breakpoint_slots(int type);
+extern void hw_breakpoint_pmu_read(struct perf_event *bp);
+
+void breakpoint_handler(struct pt_regs *regs);
+void watchpoint_handler(struct pt_regs *regs);
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+extern void ptrace_hw_copy_thread(struct task_struct *task);
+extern void hw_breakpoint_thread_switch(struct task_struct *next);
+#else
+static inline void ptrace_hw_copy_thread(struct task_struct *task)
+{
+}
+static inline void hw_breakpoint_thread_switch(struct task_struct *next)
+{
+}
+#endif
+
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+ return csr_read64(LOONGARCH_CSR_FWPC) & CSR_FWPC_NUM;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+ return csr_read64(LOONGARCH_CSR_MWPC) & CSR_MWPC_NUM;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __ASM_BREAKPOINT_H */
diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
index 7eedd83fd0d7..a04fe755d719 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -7,6 +7,7 @@
#include <linux/types.h>
#include <asm/asm.h>
+#include <asm/ptrace.h>
#define INSN_NOP 0x03400000
#define INSN_BREAK 0x002a0000
@@ -23,6 +24,10 @@
#define ADDR_IMM(addr, INSN) ((addr & ADDR_IMMMASK_##INSN) >> ADDR_IMMSHIFT_##INSN)
+enum reg0i15_op {
+ break_op = 0x54,
+};
+
enum reg0i26_op {
b_op = 0x14,
bl_op = 0x15,
@@ -32,6 +37,7 @@ enum reg1i20_op {
lu12iw_op = 0x0a,
lu32id_op = 0x0b,
pcaddi_op = 0x0c,
+ pcalau12i_op = 0x0d,
pcaddu12i_op = 0x0e,
pcaddu18i_op = 0x0f,
};
@@ -178,6 +184,11 @@ enum reg3sa2_op {
alsld_op = 0x16,
};
+struct reg0i15_format {
+ unsigned int immediate : 15;
+ unsigned int opcode : 17;
+};
+
struct reg0i26_format {
unsigned int immediate_h : 10;
unsigned int immediate_l : 16;
@@ -263,6 +274,7 @@ struct reg3sa2_format {
union loongarch_instruction {
unsigned int word;
+ struct reg0i15_format reg0i15_format;
struct reg0i26_format reg0i26_format;
struct reg1i20_format reg1i20_format;
struct reg1i21_format reg1i21_format;
@@ -321,6 +333,11 @@ static inline bool is_imm_negative(unsigned long val, unsigned int bit)
return val & (1UL << (bit - 1));
}
+static inline bool is_break_ins(union loongarch_instruction *ip)
+{
+ return ip->reg0i15_format.opcode == break_op;
+}
+
static inline bool is_pc_ins(union loongarch_instruction *ip)
{
return ip->reg1i20_format.opcode >= pcaddi_op &&
@@ -351,6 +368,47 @@ static inline bool is_stack_alloc_ins(union loongarch_instruction *ip)
is_imm12_negative(ip->reg2i12_format.immediate);
}
+static inline bool is_self_loop_ins(union loongarch_instruction *ip, struct pt_regs *regs)
+{
+ switch (ip->reg0i26_format.opcode) {
+ case b_op:
+ case bl_op:
+ if (ip->reg0i26_format.immediate_l == 0
+ && ip->reg0i26_format.immediate_h == 0)
+ return true;
+ }
+
+ switch (ip->reg1i21_format.opcode) {
+ case beqz_op:
+ case bnez_op:
+ case bceqz_op:
+ if (ip->reg1i21_format.immediate_l == 0
+ && ip->reg1i21_format.immediate_h == 0)
+ return true;
+ }
+
+ switch (ip->reg2i16_format.opcode) {
+ case beq_op:
+ case bne_op:
+ case blt_op:
+ case bge_op:
+ case bltu_op:
+ case bgeu_op:
+ if (ip->reg2i16_format.immediate == 0)
+ return true;
+ break;
+ case jirl_op:
+ if (regs->regs[ip->reg2i16_format.rj] +
+ ((unsigned long)ip->reg2i16_format.immediate << 2) == (unsigned long)ip)
+ return true;
+ }
+
+ return false;
+}
+
+void simu_pc(struct pt_regs *regs, union loongarch_instruction insn);
+void simu_branch(struct pt_regs *regs, union loongarch_instruction insn);
+
int larch_insn_read(void *addr, u32 *insnp);
int larch_insn_write(void *addr, u32 insn);
int larch_insn_patch_text(void *addr, u32 insn);
diff --git a/arch/loongarch/include/asm/kprobes.h b/arch/loongarch/include/asm/kprobes.h
new file mode 100644
index 000000000000..798020ae02c6
--- /dev/null
+++ b/arch/loongarch/include/asm/kprobes.h
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_LOONGARCH_KPROBES_H
+#define __ASM_LOONGARCH_KPROBES_H
+
+#include <asm-generic/kprobes.h>
+
+#ifdef CONFIG_KPROBES
+
+#include <asm/inst.h>
+#include <asm/cacheflush.h>
+
+#define __ARCH_WANT_KPROBES_INSN_SLOT
+#define MAX_INSN_SIZE 2
+
+#define flush_insn_slot(p) \
+do { \
+ if (p->addr) \
+ flush_icache_range((unsigned long)p->addr, \
+ (unsigned long)p->addr + \
+ (MAX_INSN_SIZE * sizeof(kprobe_opcode_t))); \
+} while (0)
+
+#define kretprobe_blacklist_size 0
+
+typedef union loongarch_instruction kprobe_opcode_t;
+
+/* Architecture specific copy of original instruction */
+struct arch_specific_insn {
+ /* copy of the original instruction */
+ kprobe_opcode_t *insn;
+ /* restore address after simulation */
+ unsigned long restore;
+};
+
+struct prev_kprobe {
+ struct kprobe *kp;
+ unsigned int status;
+};
+
+/* per-cpu kprobe control block */
+struct kprobe_ctlblk {
+ unsigned int kprobe_status;
+ unsigned long saved_status;
+ struct prev_kprobe prev_kprobe;
+};
+
+void arch_remove_kprobe(struct kprobe *p);
+bool kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+bool kprobe_breakpoint_handler(struct pt_regs *regs);
+bool kprobe_singlestep_handler(struct pt_regs *regs);
+
+void __kretprobe_trampoline(void);
+void *trampoline_probe_handler(struct pt_regs *regs);
+
+#else /* !CONFIG_KPROBES */
+
+static inline bool kprobe_breakpoint_handler(struct pt_regs *regs) { return false; }
+static inline bool kprobe_singlestep_handler(struct pt_regs *regs) { return false; }
+
+#endif /* CONFIG_KPROBES */
+#endif /* __ASM_LOONGARCH_KPROBES_H */
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 7f8d57a61c8b..65b7dcdea16d 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -970,42 +970,42 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
#define LOONGARCH_CSR_DB0ADDR 0x310 /* data breakpoint 0 address */
#define LOONGARCH_CSR_DB0MASK 0x311 /* data breakpoint 0 mask */
-#define LOONGARCH_CSR_DB0CTL 0x312 /* data breakpoint 0 control */
+#define LOONGARCH_CSR_DB0CTRL 0x312 /* data breakpoint 0 control */
#define LOONGARCH_CSR_DB0ASID 0x313 /* data breakpoint 0 asid */
#define LOONGARCH_CSR_DB1ADDR 0x318 /* data breakpoint 1 address */
#define LOONGARCH_CSR_DB1MASK 0x319 /* data breakpoint 1 mask */
-#define LOONGARCH_CSR_DB1CTL 0x31a /* data breakpoint 1 control */
+#define LOONGARCH_CSR_DB1CTRL 0x31a /* data breakpoint 1 control */
#define LOONGARCH_CSR_DB1ASID 0x31b /* data breakpoint 1 asid */
#define LOONGARCH_CSR_DB2ADDR 0x320 /* data breakpoint 2 address */
#define LOONGARCH_CSR_DB2MASK 0x321 /* data breakpoint 2 mask */
-#define LOONGARCH_CSR_DB2CTL 0x322 /* data breakpoint 2 control */
+#define LOONGARCH_CSR_DB2CTRL 0x322 /* data breakpoint 2 control */
#define LOONGARCH_CSR_DB2ASID 0x323 /* data breakpoint 2 asid */
#define LOONGARCH_CSR_DB3ADDR 0x328 /* data breakpoint 3 address */
#define LOONGARCH_CSR_DB3MASK 0x329 /* data breakpoint 3 mask */
-#define LOONGARCH_CSR_DB3CTL 0x32a /* data breakpoint 3 control */
+#define LOONGARCH_CSR_DB3CTRL 0x32a /* data breakpoint 3 control */
#define LOONGARCH_CSR_DB3ASID 0x32b /* data breakpoint 3 asid */
#define LOONGARCH_CSR_DB4ADDR 0x330 /* data breakpoint 4 address */
#define LOONGARCH_CSR_DB4MASK 0x331 /* data breakpoint 4 maks */
-#define LOONGARCH_CSR_DB4CTL 0x332 /* data breakpoint 4 control */
+#define LOONGARCH_CSR_DB4CTRL 0x332 /* data breakpoint 4 control */
#define LOONGARCH_CSR_DB4ASID 0x333 /* data breakpoint 4 asid */
#define LOONGARCH_CSR_DB5ADDR 0x338 /* data breakpoint 5 address */
#define LOONGARCH_CSR_DB5MASK 0x339 /* data breakpoint 5 mask */
-#define LOONGARCH_CSR_DB5CTL 0x33a /* data breakpoint 5 control */
+#define LOONGARCH_CSR_DB5CTRL 0x33a /* data breakpoint 5 control */
#define LOONGARCH_CSR_DB5ASID 0x33b /* data breakpoint 5 asid */
#define LOONGARCH_CSR_DB6ADDR 0x340 /* data breakpoint 6 address */
#define LOONGARCH_CSR_DB6MASK 0x341 /* data breakpoint 6 mask */
-#define LOONGARCH_CSR_DB6CTL 0x342 /* data breakpoint 6 control */
+#define LOONGARCH_CSR_DB6CTRL 0x342 /* data breakpoint 6 control */
#define LOONGARCH_CSR_DB6ASID 0x343 /* data breakpoint 6 asid */
#define LOONGARCH_CSR_DB7ADDR 0x348 /* data breakpoint 7 address */
#define LOONGARCH_CSR_DB7MASK 0x349 /* data breakpoint 7 mask */
-#define LOONGARCH_CSR_DB7CTL 0x34a /* data breakpoint 7 control */
+#define LOONGARCH_CSR_DB7CTRL 0x34a /* data breakpoint 7 control */
#define LOONGARCH_CSR_DB7ASID 0x34b /* data breakpoint 7 asid */
#define LOONGARCH_CSR_FWPC 0x380 /* instruction breakpoint config */
@@ -1013,48 +1013,51 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
#define LOONGARCH_CSR_IB0ADDR 0x390 /* inst breakpoint 0 address */
#define LOONGARCH_CSR_IB0MASK 0x391 /* inst breakpoint 0 mask */
-#define LOONGARCH_CSR_IB0CTL 0x392 /* inst breakpoint 0 control */
+#define LOONGARCH_CSR_IB0CTRL 0x392 /* inst breakpoint 0 control */
#define LOONGARCH_CSR_IB0ASID 0x393 /* inst breakpoint 0 asid */
#define LOONGARCH_CSR_IB1ADDR 0x398 /* inst breakpoint 1 address */
#define LOONGARCH_CSR_IB1MASK 0x399 /* inst breakpoint 1 mask */
-#define LOONGARCH_CSR_IB1CTL 0x39a /* inst breakpoint 1 control */
+#define LOONGARCH_CSR_IB1CTRL 0x39a /* inst breakpoint 1 control */
#define LOONGARCH_CSR_IB1ASID 0x39b /* inst breakpoint 1 asid */
#define LOONGARCH_CSR_IB2ADDR 0x3a0 /* inst breakpoint 2 address */
#define LOONGARCH_CSR_IB2MASK 0x3a1 /* inst breakpoint 2 mask */
-#define LOONGARCH_CSR_IB2CTL 0x3a2 /* inst breakpoint 2 control */
+#define LOONGARCH_CSR_IB2CTRL 0x3a2 /* inst breakpoint 2 control */
#define LOONGARCH_CSR_IB2ASID 0x3a3 /* inst breakpoint 2 asid */
#define LOONGARCH_CSR_IB3ADDR 0x3a8 /* inst breakpoint 3 address */
#define LOONGARCH_CSR_IB3MASK 0x3a9 /* breakpoint 3 mask */
-#define LOONGARCH_CSR_IB3CTL 0x3aa /* inst breakpoint 3 control */
+#define LOONGARCH_CSR_IB3CTRL 0x3aa /* inst breakpoint 3 control */
#define LOONGARCH_CSR_IB3ASID 0x3ab /* inst breakpoint 3 asid */
#define LOONGARCH_CSR_IB4ADDR 0x3b0 /* inst breakpoint 4 address */
#define LOONGARCH_CSR_IB4MASK 0x3b1 /* inst breakpoint 4 mask */
-#define LOONGARCH_CSR_IB4CTL 0x3b2 /* inst breakpoint 4 control */
+#define LOONGARCH_CSR_IB4CTRL 0x3b2 /* inst breakpoint 4 control */
#define LOONGARCH_CSR_IB4ASID 0x3b3 /* inst breakpoint 4 asid */
#define LOONGARCH_CSR_IB5ADDR 0x3b8 /* inst breakpoint 5 address */
#define LOONGARCH_CSR_IB5MASK 0x3b9 /* inst breakpoint 5 mask */
-#define LOONGARCH_CSR_IB5CTL 0x3ba /* inst breakpoint 5 control */
+#define LOONGARCH_CSR_IB5CTRL 0x3ba /* inst breakpoint 5 control */
#define LOONGARCH_CSR_IB5ASID 0x3bb /* inst breakpoint 5 asid */
#define LOONGARCH_CSR_IB6ADDR 0x3c0 /* inst breakpoint 6 address */
#define LOONGARCH_CSR_IB6MASK 0x3c1 /* inst breakpoint 6 mask */
-#define LOONGARCH_CSR_IB6CTL 0x3c2 /* inst breakpoint 6 control */
+#define LOONGARCH_CSR_IB6CTRL 0x3c2 /* inst breakpoint 6 control */
#define LOONGARCH_CSR_IB6ASID 0x3c3 /* inst breakpoint 6 asid */
#define LOONGARCH_CSR_IB7ADDR 0x3c8 /* inst breakpoint 7 address */
#define LOONGARCH_CSR_IB7MASK 0x3c9 /* inst breakpoint 7 mask */
-#define LOONGARCH_CSR_IB7CTL 0x3ca /* inst breakpoint 7 control */
+#define LOONGARCH_CSR_IB7CTRL 0x3ca /* inst breakpoint 7 control */
#define LOONGARCH_CSR_IB7ASID 0x3cb /* inst breakpoint 7 asid */
#define LOONGARCH_CSR_DEBUG 0x500 /* debug config */
#define LOONGARCH_CSR_DERA 0x501 /* debug era */
#define LOONGARCH_CSR_DESAVE 0x502 /* debug save */
+#define CSR_FWPC_SKIP_SHIFT 16
+#define CSR_FWPC_SKIP (_ULCAST_(1) << CSR_FWPC_SKIP_SHIFT)
+
/*
* CSR_ECFG IM
*/
diff --git a/arch/loongarch/include/asm/processor.h b/arch/loongarch/include/asm/processor.h
index 7184f1dc61f2..636e1c66398c 100644
--- a/arch/loongarch/include/asm/processor.h
+++ b/arch/loongarch/include/asm/processor.h
@@ -11,6 +11,7 @@
#include <asm/cpu.h>
#include <asm/cpu-info.h>
+#include <asm/hw_breakpoint.h>
#include <asm/loongarch.h>
#include <asm/vdso/processor.h>
#include <uapi/asm/ptrace.h>
@@ -124,13 +125,18 @@ struct thread_struct {
/* Other stuff associated with the thread. */
unsigned long trap_nr;
unsigned long error_code;
+ unsigned long single_step; /* Used by PTRACE_SINGLESTEP */
struct loongarch_vdso_info *vdso;
/*
- * FPU & vector registers, must be at last because
- * they are conditionally copied at fork().
+ * FPU & vector registers, must be at the last of inherited
+ * context because they are conditionally copied at fork().
*/
struct loongarch_fpu fpu FPU_ALIGN;
+
+ /* Hardware breakpoints pinned to this task. */
+ struct perf_event *hbp_break[LOONGARCH_MAX_BRP];
+ struct perf_event *hbp_watch[LOONGARCH_MAX_WRP];
};
#define thread_saved_ra(tsk) (tsk->thread.sched_ra)
@@ -172,6 +178,8 @@ struct thread_struct {
.fcc = 0, \
.fpr = {{{0,},},}, \
}, \
+ .hbp_break = {0}, \
+ .hbp_watch = {0}, \
}
struct task_struct;
@@ -184,10 +192,6 @@ extern unsigned long boot_option_idle_override;
*/
extern void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp);
-static inline void flush_thread(void)
-{
-}
-
unsigned long __get_wchan(struct task_struct *p);
#define __KSTK_TOS(tsk) ((unsigned long)task_stack_page(tsk) + \
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index 59c4608de91d..d761db943335 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -6,6 +6,7 @@
#define _ASM_PTRACE_H
#include <asm/page.h>
+#include <asm/irqflags.h>
#include <asm/thread_info.h>
#include <uapi/asm/ptrace.h>
@@ -109,6 +110,40 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsi
struct task_struct;
+/**
+ * regs_get_kernel_argument() - get Nth function argument in kernel
+ * @regs: pt_regs of that context
+ * @n: function argument number (start from 0)
+ *
+ * regs_get_argument() returns @n th argument of the function call.
+ * Note that this chooses most probably assignment, in some case
+ * it can be incorrect.
+ * This is expected to be called from kprobes or ftrace with regs
+ * where the top of stack is the return address.
+ */
+static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
+ unsigned int n)
+{
+#define NR_REG_ARGUMENTS 8
+ static const unsigned int args[] = {
+ offsetof(struct pt_regs, regs[4]),
+ offsetof(struct pt_regs, regs[5]),
+ offsetof(struct pt_regs, regs[6]),
+ offsetof(struct pt_regs, regs[7]),
+ offsetof(struct pt_regs, regs[8]),
+ offsetof(struct pt_regs, regs[9]),
+ offsetof(struct pt_regs, regs[10]),
+ offsetof(struct pt_regs, regs[11]),
+ };
+
+ if (n < NR_REG_ARGUMENTS)
+ return regs_get_register(regs, args[n]);
+ else {
+ n -= NR_REG_ARGUMENTS;
+ return regs_get_kernel_stack_nth(regs, n);
+ }
+}
+
/*
* Does the process account for user or for system time?
*/
@@ -149,4 +184,8 @@ static inline void user_stack_pointer_set(struct pt_regs *regs,
regs->regs[3] = val;
}
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#define arch_has_single_step() (1)
+#endif
+
#endif /* _ASM_PTRACE_H */
diff --git a/arch/loongarch/include/asm/setup.h b/arch/loongarch/include/asm/setup.h
index 72ead58039f3..be05c0e706a2 100644
--- a/arch/loongarch/include/asm/setup.h
+++ b/arch/loongarch/include/asm/setup.h
@@ -21,4 +21,20 @@ extern void per_cpu_trap_init(int cpu);
extern void set_handler(unsigned long offset, void *addr, unsigned long len);
extern void set_merr_handler(unsigned long offset, void *addr, unsigned long len);
+#ifdef CONFIG_RELOCATABLE
+
+struct rela_la_abs {
+ long offset;
+ long symvalue;
+};
+
+extern long __la_abs_begin;
+extern long __la_abs_end;
+extern long __rela_dyn_begin;
+extern long __rela_dyn_end;
+
+extern void * __init relocate_kernel(void);
+
+#endif
+
#endif /* __SETUP_H */
diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
index 4ca953062b5b..7df80e6ae9d2 100644
--- a/arch/loongarch/include/asm/stackframe.h
+++ b/arch/loongarch/include/asm/stackframe.h
@@ -7,6 +7,7 @@
#include <linux/threads.h>
+#include <asm/addrspace.h>
#include <asm/asm.h>
#include <asm/asmmacro.h>
#include <asm/asm-offsets.h>
@@ -36,6 +37,14 @@
cfi_restore \reg \offset \docfi
.endm
+/* Jump to the runtime virtual address. */
+ .macro JUMP_VIRT_ADDR temp1 temp2
+ li.d \temp1, CACHE_BASE
+ pcaddi \temp2, 0
+ or \temp1, \temp1, \temp2
+ jirl zero, \temp1, 0xc
+ .endm
+
.macro BACKUP_T0T1
csrwr t0, EXCEPTION_KS0
csrwr t1, EXCEPTION_KS1
@@ -77,7 +86,7 @@
* new value in sp.
*/
.macro get_saved_sp docfi=0
- la.abs t1, kernelsp
+ la_abs t1, kernelsp
#ifdef CONFIG_SMP
csrrd t0, PERCPU_BASE_KS
LONG_ADD t1, t1, t0
@@ -90,7 +99,7 @@
.endm
.macro set_saved_sp stackp temp temp2
- la.abs \temp, kernelsp
+ la.pcrel \temp, kernelsp
#ifdef CONFIG_SMP
LONG_ADD \temp, \temp, u0
#endif
diff --git a/arch/loongarch/include/asm/switch_to.h b/arch/loongarch/include/asm/switch_to.h
index 43a5ab162d38..24e3094bebab 100644
--- a/arch/loongarch/include/asm/switch_to.h
+++ b/arch/loongarch/include/asm/switch_to.h
@@ -34,6 +34,7 @@ extern asmlinkage struct task_struct *__switch_to(struct task_struct *prev,
#define switch_to(prev, next, last) \
do { \
lose_fpu_inatomic(1, prev); \
+ hw_breakpoint_thread_switch(next); \
(last) = __switch_to(prev, next, task_thread_info(next), \
__builtin_return_address(0), __builtin_frame_address(0)); \
} while (0)
diff --git a/arch/loongarch/include/asm/uaccess.h b/arch/loongarch/include/asm/uaccess.h
index 255899d4a7c3..0d22991ae430 100644
--- a/arch/loongarch/include/asm/uaccess.h
+++ b/arch/loongarch/include/asm/uaccess.h
@@ -22,7 +22,6 @@
extern u64 __ua_limit;
#define __UA_ADDR ".dword"
-#define __UA_LA "la.abs"
#define __UA_LIMIT __ua_limit
/*
diff --git a/arch/loongarch/include/uapi/asm/ptrace.h b/arch/loongarch/include/uapi/asm/ptrace.h
index 083193f4a5d5..cc48ed262021 100644
--- a/arch/loongarch/include/uapi/asm/ptrace.h
+++ b/arch/loongarch/include/uapi/asm/ptrace.h
@@ -46,6 +46,15 @@ struct user_fp_state {
uint32_t fcsr;
};
+struct user_watch_state {
+ uint16_t dbg_info;
+ struct {
+ uint64_t addr;
+ uint64_t mask;
+ uint32_t ctrl;
+ } dbg_regs[8];
+};
+
#define PTRACE_SYSEMU 0x1f
#define PTRACE_SYSEMU_SINGLESTEP 0x20
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index c8cfbd562921..78d4e3384305 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -8,13 +8,15 @@ extra-y := vmlinux.lds
obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \
traps.o irq.o idle.o process.o dma.o mem.o io.o reset.o switch.o \
elf.o syscall.o signal.o time.o topology.o inst.o ptrace.o vdso.o \
- alternative.o unaligned.o unwind.o
+ alternative.o unwind.o
obj-$(CONFIG_ACPI) += acpi.o
obj-$(CONFIG_EFI) += efi.o
obj-$(CONFIG_CPU_HAS_FPU) += fpu.o
+obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o
+
ifdef CONFIG_FUNCTION_TRACER
ifndef CONFIG_DYNAMIC_FTRACE
obj-y += mcount.o ftrace.o
@@ -39,6 +41,8 @@ obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_MAGIC_SYSRQ) += sysrq.o
+obj-$(CONFIG_RELOCATABLE) += relocate.o
+
obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
@@ -46,5 +50,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_UNWINDER_PROLOGUE) += unwind_prologue.o
obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_regs.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+
+obj-$(CONFIG_KPROBES) += kprobes.o kprobes_trampoline.o
CPPFLAGS_vmlinux.lds := $(KBUILD_CFLAGS)
diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S
index d53b631c9022..d737e3cf42d3 100644
--- a/arch/loongarch/kernel/entry.S
+++ b/arch/loongarch/kernel/entry.S
@@ -19,70 +19,71 @@
.cfi_sections .debug_frame
.align 5
SYM_FUNC_START(handle_syscall)
- csrrd t0, PERCPU_BASE_KS
- la.abs t1, kernelsp
- add.d t1, t1, t0
- move t2, sp
- ld.d sp, t1, 0
+ csrrd t0, PERCPU_BASE_KS
+ la.pcrel t1, kernelsp
+ add.d t1, t1, t0
+ move t2, sp
+ ld.d sp, t1, 0
- addi.d sp, sp, -PT_SIZE
- cfi_st t2, PT_R3
+ addi.d sp, sp, -PT_SIZE
+ cfi_st t2, PT_R3
cfi_rel_offset sp, PT_R3
- st.d zero, sp, PT_R0
- csrrd t2, LOONGARCH_CSR_PRMD
- st.d t2, sp, PT_PRMD
- csrrd t2, LOONGARCH_CSR_CRMD
- st.d t2, sp, PT_CRMD
- csrrd t2, LOONGARCH_CSR_EUEN
- st.d t2, sp, PT_EUEN
- csrrd t2, LOONGARCH_CSR_ECFG
- st.d t2, sp, PT_ECFG
- csrrd t2, LOONGARCH_CSR_ESTAT
- st.d t2, sp, PT_ESTAT
- cfi_st ra, PT_R1
- cfi_st a0, PT_R4
- cfi_st a1, PT_R5
- cfi_st a2, PT_R6
- cfi_st a3, PT_R7
- cfi_st a4, PT_R8
- cfi_st a5, PT_R9
- cfi_st a6, PT_R10
- cfi_st a7, PT_R11
- csrrd ra, LOONGARCH_CSR_ERA
- st.d ra, sp, PT_ERA
+ st.d zero, sp, PT_R0
+ csrrd t2, LOONGARCH_CSR_PRMD
+ st.d t2, sp, PT_PRMD
+ csrrd t2, LOONGARCH_CSR_CRMD
+ st.d t2, sp, PT_CRMD
+ csrrd t2, LOONGARCH_CSR_EUEN
+ st.d t2, sp, PT_EUEN
+ csrrd t2, LOONGARCH_CSR_ECFG
+ st.d t2, sp, PT_ECFG
+ csrrd t2, LOONGARCH_CSR_ESTAT
+ st.d t2, sp, PT_ESTAT
+ cfi_st ra, PT_R1
+ cfi_st a0, PT_R4
+ cfi_st a1, PT_R5
+ cfi_st a2, PT_R6
+ cfi_st a3, PT_R7
+ cfi_st a4, PT_R8
+ cfi_st a5, PT_R9
+ cfi_st a6, PT_R10
+ cfi_st a7, PT_R11
+ csrrd ra, LOONGARCH_CSR_ERA
+ st.d ra, sp, PT_ERA
cfi_rel_offset ra, PT_ERA
- cfi_st tp, PT_R2
- cfi_st u0, PT_R21
- cfi_st fp, PT_R22
+ cfi_st tp, PT_R2
+ cfi_st u0, PT_R21
+ cfi_st fp, PT_R22
SAVE_STATIC
- move u0, t0
- li.d tp, ~_THREAD_MASK
- and tp, tp, sp
+ move u0, t0
+ li.d tp, ~_THREAD_MASK
+ and tp, tp, sp
- move a0, sp
- bl do_syscall
+ move a0, sp
+ bl do_syscall
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_syscall)
+_ASM_NOKPROBE(handle_syscall)
SYM_CODE_START(ret_from_fork)
- bl schedule_tail # a0 = struct task_struct *prev
- move a0, sp
- bl syscall_exit_to_user_mode
+ bl schedule_tail # a0 = struct task_struct *prev
+ move a0, sp
+ bl syscall_exit_to_user_mode
RESTORE_STATIC
RESTORE_SOME
RESTORE_SP_AND_RET
SYM_CODE_END(ret_from_fork)
SYM_CODE_START(ret_from_kernel_thread)
- bl schedule_tail # a0 = struct task_struct *prev
- move a0, s1
- jirl ra, s0, 0
- move a0, sp
- bl syscall_exit_to_user_mode
+ bl schedule_tail # a0 = struct task_struct *prev
+ move a0, s1
+ jirl ra, s0, 0
+ move a0, sp
+ bl syscall_exit_to_user_mode
RESTORE_STATIC
RESTORE_SOME
RESTORE_SP_AND_RET
diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index 0f07591cab30..4a3ef8516ccc 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -6,6 +6,7 @@
*/
#include <linux/ftrace.h>
+#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <asm/inst.h>
@@ -271,3 +272,66 @@ int ftrace_disable_ftrace_graph_caller(void)
}
#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+#ifdef CONFIG_KPROBES_ON_FTRACE
+/* Ftrace callback handler for kprobes -- called under preepmt disabled */
+void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ int bit;
+ struct pt_regs *regs;
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ p = get_kprobe((kprobe_opcode_t *)ip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto out;
+
+ regs = ftrace_get_regs(fregs);
+ if (!regs)
+ goto out;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ unsigned long orig_ip = instruction_pointer(regs);
+
+ instruction_pointer_set(regs, ip);
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ /*
+ * Emulate singlestep (and also recover regs->csr_era)
+ * as if there is a nop
+ */
+ instruction_pointer_set(regs, (unsigned long)p->addr + MCOUNT_INSN_SIZE);
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ instruction_pointer_set(regs, orig_ip);
+ }
+
+ /*
+ * If pre_handler returns !0, it changes regs->csr_era. We have to
+ * skip emulating post_handler.
+ */
+ __this_cpu_write(current_kprobe, NULL);
+ }
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ return 0;
+}
+#endif /* CONFIG_KPROBES_ON_FTRACE */
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 7e5c293ed89f..44ff1ff64260 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -34,7 +34,7 @@ SYM_FUNC_END(__arch_cpu_idle)
SYM_FUNC_START(handle_vint)
BACKUP_T0T1
SAVE_ALL
- la.abs t1, __arch_cpu_idle
+ la_abs t1, __arch_cpu_idle
LONG_L t0, sp, PT_ERA
/* 32 byte rollback region */
ori t0, t0, 0x1f
@@ -43,7 +43,7 @@ SYM_FUNC_START(handle_vint)
LONG_S t0, sp, PT_ERA
1: move a0, sp
move a1, sp
- la.abs t0, do_vint
+ la_abs t0, do_vint
jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_vint)
@@ -72,7 +72,7 @@ SYM_FUNC_END(except_vec_cex)
SAVE_ALL
build_prep_\prep
move a0, sp
- la.abs t0, do_\handler
+ la_abs t0, do_\handler
jirl ra, t0, 0
668:
RESTORE_ALL_AND_RET
@@ -93,6 +93,6 @@ SYM_FUNC_END(except_vec_cex)
BUILD_HANDLER reserved reserved none /* others */
SYM_FUNC_START(handle_sys)
- la.abs t0, handle_syscall
+ la_abs t0, handle_syscall
jr t0
SYM_FUNC_END(handle_sys)
diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S
index 57bada6b4e93..aa64b179744f 100644
--- a/arch/loongarch/kernel/head.S
+++ b/arch/loongarch/kernel/head.S
@@ -24,7 +24,7 @@ _head:
.org 0x8
.dword kernel_entry /* Kernel entry point */
.dword _end - _text /* Kernel image effective size */
- .quad 0 /* Kernel image load offset from start of RAM */
+ .quad PHYS_LINK_KADDR /* Kernel image load offset from start of RAM */
.org 0x38 /* 0x20 ~ 0x37 reserved */
.long LINUX_PE_MAGIC
.long pe_header - _head /* Offset to the PE header */
@@ -50,11 +50,8 @@ SYM_CODE_START(kernel_entry) # kernel entry point
li.d t0, CSR_DMW1_INIT # CA, PLV0, 0x9000 xxxx xxxx xxxx
csrwr t0, LOONGARCH_CSR_DMWIN1
- /* We might not get launched at the address the kernel is linked to,
- so we jump there. */
- la.abs t0, 0f
- jr t0
-0:
+ JUMP_VIRT_ADDR t0, t1
+
/* Enable PG */
li.w t0, 0xb0 # PLV=0, IE=0, PG=1
csrwr t0, LOONGARCH_CSR_CRMD
@@ -89,6 +86,23 @@ SYM_CODE_START(kernel_entry) # kernel entry point
PTR_ADD sp, sp, tp
set_saved_sp sp, t0, t1
+#ifdef CONFIG_RELOCATABLE
+
+ bl relocate_kernel
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ /* Repoint the sp into the new kernel */
+ PTR_LI sp, (_THREAD_SIZE - PT_SIZE)
+ PTR_ADD sp, sp, tp
+ set_saved_sp sp, t0, t1
+#endif
+
+ /* relocate_kernel() returns the new kernel entry point */
+ jr a0
+ ASM_BUG()
+
+#endif
+
bl start_kernel
ASM_BUG()
@@ -106,9 +120,8 @@ SYM_CODE_START(smpboot_entry)
li.d t0, CSR_DMW1_INIT # CA, PLV0
csrwr t0, LOONGARCH_CSR_DMWIN1
- la.abs t0, 0f
- jr t0
-0:
+ JUMP_VIRT_ADDR t0, t1
+
/* Enable PG */
li.w t0, 0xb0 # PLV=0, IE=0, PG=1
csrwr t0, LOONGARCH_CSR_CRMD
@@ -117,7 +130,7 @@ SYM_CODE_START(smpboot_entry)
li.w t0, 0x00 # FPE=0, SXE=0, ASXE=0, BTE=0
csrwr t0, LOONGARCH_CSR_EUEN
- la.abs t0, cpuboot_data
+ la.pcrel t0, cpuboot_data
ld.d sp, t0, CPU_BOOT_STACK
ld.d tp, t0, CPU_BOOT_TINFO
diff --git a/arch/loongarch/kernel/hw_breakpoint.c b/arch/loongarch/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..2406c95b34cc
--- /dev/null
+++ b/arch/loongarch/kernel/hw_breakpoint.c
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2023 Loongson Technology Corporation Limited
+ */
+#define pr_fmt(fmt) "hw-breakpoint: " fmt
+
+#include <linux/hw_breakpoint.h>
+#include <linux/kprobes.h>
+#include <linux/perf_event.h>
+
+#include <asm/hw_breakpoint.h>
+
+/* Breakpoint currently in use for each BRP. */
+static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[LOONGARCH_MAX_BRP]);
+
+/* Watchpoint currently in use for each WRP. */
+static DEFINE_PER_CPU(struct perf_event *, wp_on_reg[LOONGARCH_MAX_WRP]);
+
+int hw_breakpoint_slots(int type)
+{
+ /*
+ * We can be called early, so don't rely on
+ * our static variables being initialised.
+ */
+ switch (type) {
+ case TYPE_INST:
+ return get_num_brps();
+ case TYPE_DATA:
+ return get_num_wrps();
+ default:
+ pr_warn("unknown slot type: %d\n", type);
+ return 0;
+ }
+}
+
+#define READ_WB_REG_CASE(OFF, N, REG, T, VAL) \
+ case (OFF + N): \
+ LOONGARCH_CSR_WATCH_READ(N, REG, T, VAL); \
+ break
+
+#define WRITE_WB_REG_CASE(OFF, N, REG, T, VAL) \
+ case (OFF + N): \
+ LOONGARCH_CSR_WATCH_WRITE(N, REG, T, VAL); \
+ break
+
+#define GEN_READ_WB_REG_CASES(OFF, REG, T, VAL) \
+ READ_WB_REG_CASE(OFF, 0, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 1, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 2, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 3, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 4, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 5, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 6, REG, T, VAL); \
+ READ_WB_REG_CASE(OFF, 7, REG, T, VAL);
+
+#define GEN_WRITE_WB_REG_CASES(OFF, REG, T, VAL) \
+ WRITE_WB_REG_CASE(OFF, 0, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 1, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 2, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 3, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 4, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 5, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 6, REG, T, VAL); \
+ WRITE_WB_REG_CASE(OFF, 7, REG, T, VAL);
+
+static u64 read_wb_reg(int reg, int n, int t)
+{
+ u64 val = 0;
+
+ switch (reg + n) {
+ GEN_READ_WB_REG_CASES(CSR_CFG_ADDR, ADDR, t, val);
+ GEN_READ_WB_REG_CASES(CSR_CFG_MASK, MASK, t, val);
+ GEN_READ_WB_REG_CASES(CSR_CFG_CTRL, CTRL, t, val);
+ GEN_READ_WB_REG_CASES(CSR_CFG_ASID, ASID, t, val);
+ default:
+ pr_warn("Attempt to read from unknown breakpoint register %d\n", n);
+ }
+
+ return val;
+}
+NOKPROBE_SYMBOL(read_wb_reg);
+
+static void write_wb_reg(int reg, int n, int t, u64 val)
+{
+ switch (reg + n) {
+ GEN_WRITE_WB_REG_CASES(CSR_CFG_ADDR, ADDR, t, val);
+ GEN_WRITE_WB_REG_CASES(CSR_CFG_MASK, MASK, t, val);
+ GEN_WRITE_WB_REG_CASES(CSR_CFG_CTRL, CTRL, t, val);
+ GEN_WRITE_WB_REG_CASES(CSR_CFG_ASID, ASID, t, val);
+ default:
+ pr_warn("Attempt to write to unknown breakpoint register %d\n", n);
+ }
+}
+NOKPROBE_SYMBOL(write_wb_reg);
+
+enum hw_breakpoint_ops {
+ HW_BREAKPOINT_INSTALL,
+ HW_BREAKPOINT_UNINSTALL,
+};
+
+/*
+ * hw_breakpoint_slot_setup - Find and setup a perf slot according to operations
+ *
+ * @slots: pointer to array of slots
+ * @max_slots: max number of slots
+ * @bp: perf_event to setup
+ * @ops: operation to be carried out on the slot
+ *
+ * Return:
+ * slot index on success
+ * -ENOSPC if no slot is available/matches
+ * -EINVAL on wrong operations parameter
+ */
+
+static int hw_breakpoint_slot_setup(struct perf_event **slots, int max_slots,
+ struct perf_event *bp, enum hw_breakpoint_ops ops)
+{
+ int i;
+ struct perf_event **slot;
+
+ for (i = 0; i < max_slots; ++i) {
+ slot = &slots[i];
+ switch (ops) {
+ case HW_BREAKPOINT_INSTALL:
+ if (!*slot) {
+ *slot = bp;
+ return i;
+ }
+ break;
+ case HW_BREAKPOINT_UNINSTALL:
+ if (*slot == bp) {
+ *slot = NULL;
+ return i;
+ }
+ break;
+ default:
+ pr_warn_once("Unhandled hw breakpoint ops %d\n", ops);
+ return -EINVAL;
+ }
+ }
+
+ return -ENOSPC;
+}
+
+void ptrace_hw_copy_thread(struct task_struct *tsk)
+{
+ memset(tsk->thread.hbp_break, 0, sizeof(tsk->thread.hbp_break));
+ memset(tsk->thread.hbp_watch, 0, sizeof(tsk->thread.hbp_watch));
+}
+
+/*
+ * Unregister breakpoints from this task and reset the pointers in the thread_struct.
+ */
+void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
+{
+ int i;
+ struct thread_struct *t = &tsk->thread;
+
+ for (i = 0; i < LOONGARCH_MAX_BRP; i++) {
+ if (t->hbp_break[i]) {
+ unregister_hw_breakpoint(t->hbp_break[i]);
+ t->hbp_break[i] = NULL;
+ }
+ }
+
+ for (i = 0; i < LOONGARCH_MAX_WRP; i++) {
+ if (t->hbp_watch[i]) {
+ unregister_hw_breakpoint(t->hbp_watch[i]);
+ t->hbp_watch[i] = NULL;
+ }
+ }
+}
+
+static int hw_breakpoint_control(struct perf_event *bp,
+ enum hw_breakpoint_ops ops)
+{
+ u32 ctrl;
+ int i, max_slots, enable;
+ struct perf_event **slots;
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
+
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) {
+ /* Breakpoint */
+ slots = this_cpu_ptr(bp_on_reg);
+ max_slots = boot_cpu_data.watch_ireg_count;
+ } else {
+ /* Watchpoint */
+ slots = this_cpu_ptr(wp_on_reg);
+ max_slots = boot_cpu_data.watch_dreg_count;
+ }
+
+ i = hw_breakpoint_slot_setup(slots, max_slots, bp, ops);
+
+ if (WARN_ONCE(i < 0, "Can't find any breakpoint slot"))
+ return i;
+
+ switch (ops) {
+ case HW_BREAKPOINT_INSTALL:
+ /* Set the FWPnCFG/MWPnCFG 1~4 register. */
+ write_wb_reg(CSR_CFG_ADDR, i, 0, info->address);
+ write_wb_reg(CSR_CFG_ADDR, i, 1, info->address);
+ write_wb_reg(CSR_CFG_MASK, i, 0, info->mask);
+ write_wb_reg(CSR_CFG_MASK, i, 1, info->mask);
+ write_wb_reg(CSR_CFG_ASID, i, 0, 0);
+ write_wb_reg(CSR_CFG_ASID, i, 1, 0);
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) {
+ write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE);
+ } else {
+ ctrl = encode_ctrl_reg(info->ctrl);
+ write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl | CTRL_PLV_ENABLE |
+ 1 << MWPnCFG3_LoadEn | 1 << MWPnCFG3_StoreEn);
+ }
+ enable = csr_read64(LOONGARCH_CSR_CRMD);
+ csr_write64(CSR_CRMD_WE | enable, LOONGARCH_CSR_CRMD);
+ break;
+ case HW_BREAKPOINT_UNINSTALL:
+ /* Reset the FWPnCFG/MWPnCFG 1~4 register. */
+ write_wb_reg(CSR_CFG_ADDR, i, 0, 0);
+ write_wb_reg(CSR_CFG_ADDR, i, 1, 0);
+ write_wb_reg(CSR_CFG_MASK, i, 0, 0);
+ write_wb_reg(CSR_CFG_MASK, i, 1, 0);
+ write_wb_reg(CSR_CFG_CTRL, i, 0, 0);
+ write_wb_reg(CSR_CFG_CTRL, i, 1, 0);
+ write_wb_reg(CSR_CFG_ASID, i, 0, 0);
+ write_wb_reg(CSR_CFG_ASID, i, 1, 0);
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Install a perf counter breakpoint.
+ */
+int arch_install_hw_breakpoint(struct perf_event *bp)
+{
+ return hw_breakpoint_control(bp, HW_BREAKPOINT_INSTALL);
+}
+
+void arch_uninstall_hw_breakpoint(struct perf_event *bp)
+{
+ hw_breakpoint_control(bp, HW_BREAKPOINT_UNINSTALL);
+}
+
+static int get_hbp_len(u8 hbp_len)
+{
+ unsigned int len_in_bytes = 0;
+
+ switch (hbp_len) {
+ case LOONGARCH_BREAKPOINT_LEN_1:
+ len_in_bytes = 1;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_2:
+ len_in_bytes = 2;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_4:
+ len_in_bytes = 4;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_8:
+ len_in_bytes = 8;
+ break;
+ }
+
+ return len_in_bytes;
+}
+
+/*
+ * Check whether bp virtual address is in kernel space.
+ */
+int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
+{
+ unsigned int len;
+ unsigned long va;
+
+ va = hw->address;
+ len = get_hbp_len(hw->ctrl.len);
+
+ return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+}
+
+/*
+ * Extract generic type and length encodings from an arch_hw_breakpoint_ctrl.
+ * Hopefully this will disappear when ptrace can bypass the conversion
+ * to generic breakpoint descriptions.
+ */
+int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
+ int *gen_len, int *gen_type, int *offset)
+{
+ /* Type */
+ switch (ctrl.type) {
+ case LOONGARCH_BREAKPOINT_EXECUTE:
+ *gen_type = HW_BREAKPOINT_X;
+ break;
+ case LOONGARCH_BREAKPOINT_LOAD:
+ *gen_type = HW_BREAKPOINT_R;
+ break;
+ case LOONGARCH_BREAKPOINT_STORE:
+ *gen_type = HW_BREAKPOINT_W;
+ break;
+ case LOONGARCH_BREAKPOINT_LOAD | LOONGARCH_BREAKPOINT_STORE:
+ *gen_type = HW_BREAKPOINT_RW;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (!ctrl.len)
+ return -EINVAL;
+
+ *offset = __ffs(ctrl.len);
+
+ /* Len */
+ switch (ctrl.len) {
+ case LOONGARCH_BREAKPOINT_LEN_1:
+ *gen_len = HW_BREAKPOINT_LEN_1;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_2:
+ *gen_len = HW_BREAKPOINT_LEN_2;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_4:
+ *gen_len = HW_BREAKPOINT_LEN_4;
+ break;
+ case LOONGARCH_BREAKPOINT_LEN_8:
+ *gen_len = HW_BREAKPOINT_LEN_8;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Construct an arch_hw_breakpoint from a perf_event.
+ */
+static int arch_build_bp_info(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ /* Type */
+ switch (attr->bp_type) {
+ case HW_BREAKPOINT_X:
+ hw->ctrl.type = LOONGARCH_BREAKPOINT_EXECUTE;
+ break;
+ case HW_BREAKPOINT_R:
+ hw->ctrl.type = LOONGARCH_BREAKPOINT_LOAD;
+ break;
+ case HW_BREAKPOINT_W:
+ hw->ctrl.type = LOONGARCH_BREAKPOINT_STORE;
+ break;
+ case HW_BREAKPOINT_RW:
+ hw->ctrl.type = LOONGARCH_BREAKPOINT_LOAD | LOONGARCH_BREAKPOINT_STORE;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Len */
+ switch (attr->bp_len) {
+ case HW_BREAKPOINT_LEN_1:
+ hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_1;
+ break;
+ case HW_BREAKPOINT_LEN_2:
+ hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_2;
+ break;
+ case HW_BREAKPOINT_LEN_4:
+ hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_4;
+ break;
+ case HW_BREAKPOINT_LEN_8:
+ hw->ctrl.len = LOONGARCH_BREAKPOINT_LEN_8;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Address */
+ hw->address = attr->bp_addr;
+
+ return 0;
+}
+
+/*
+ * Validate the arch-specific HW Breakpoint register settings.
+ */
+int hw_breakpoint_arch_parse(struct perf_event *bp,
+ const struct perf_event_attr *attr,
+ struct arch_hw_breakpoint *hw)
+{
+ int ret;
+ u64 alignment_mask, offset;
+
+ /* Build the arch_hw_breakpoint. */
+ ret = arch_build_bp_info(bp, attr, hw);
+ if (ret)
+ return ret;
+
+ if (hw->ctrl.type != LOONGARCH_BREAKPOINT_EXECUTE)
+ alignment_mask = 0x7;
+ offset = hw->address & alignment_mask;
+
+ hw->address &= ~alignment_mask;
+ hw->ctrl.len <<= offset;
+
+ return 0;
+}
+
+static void update_bp_registers(struct pt_regs *regs, int enable, int type)
+{
+ u32 ctrl;
+ int i, max_slots;
+ struct perf_event **slots;
+ struct arch_hw_breakpoint *info;
+
+ switch (type) {
+ case 0:
+ slots = this_cpu_ptr(bp_on_reg);
+ max_slots = boot_cpu_data.watch_ireg_count;
+ break;
+ case 1:
+ slots = this_cpu_ptr(wp_on_reg);
+ max_slots = boot_cpu_data.watch_dreg_count;
+ break;
+ default:
+ return;
+ }
+
+ for (i = 0; i < max_slots; ++i) {
+ if (!slots[i])
+ continue;
+
+ info = counter_arch_bp(slots[i]);
+ if (enable) {
+ if ((info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) && (type == 0)) {
+ write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE);
+ write_wb_reg(CSR_CFG_CTRL, i, 0, CTRL_PLV_ENABLE);
+ } else {
+ ctrl = read_wb_reg(CSR_CFG_CTRL, i, 1);
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_LOAD)
+ ctrl |= 0x1 << MWPnCFG3_LoadEn;
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_STORE)
+ ctrl |= 0x1 << MWPnCFG3_StoreEn;
+ write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl);
+ }
+ regs->csr_prmd |= CSR_PRMD_PWE;
+ } else {
+ if ((info->ctrl.type == LOONGARCH_BREAKPOINT_EXECUTE) && (type == 0)) {
+ write_wb_reg(CSR_CFG_CTRL, i, 0, 0);
+ } else {
+ ctrl = read_wb_reg(CSR_CFG_CTRL, i, 1);
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_LOAD)
+ ctrl &= ~0x1 << MWPnCFG3_LoadEn;
+ if (info->ctrl.type == LOONGARCH_BREAKPOINT_STORE)
+ ctrl &= ~0x1 << MWPnCFG3_StoreEn;
+ write_wb_reg(CSR_CFG_CTRL, i, 1, ctrl);
+ }
+ regs->csr_prmd &= ~CSR_PRMD_PWE;
+ }
+ }
+}
+NOKPROBE_SYMBOL(update_bp_registers);
+
+/*
+ * Debug exception handlers.
+ */
+void breakpoint_handler(struct pt_regs *regs)
+{
+ int i;
+ struct perf_event *bp, **slots;
+
+ slots = this_cpu_ptr(bp_on_reg);
+
+ for (i = 0; i < boot_cpu_data.watch_ireg_count; ++i) {
+ bp = slots[i];
+ if (bp == NULL)
+ continue;
+ perf_bp_event(bp, regs);
+ }
+ update_bp_registers(regs, 0, 0);
+}
+NOKPROBE_SYMBOL(breakpoint_handler);
+
+void watchpoint_handler(struct pt_regs *regs)
+{
+ int i;
+ struct perf_event *wp, **slots;
+
+ slots = this_cpu_ptr(wp_on_reg);
+
+ for (i = 0; i < boot_cpu_data.watch_dreg_count; ++i) {
+ wp = slots[i];
+ if (wp == NULL)
+ continue;
+ perf_bp_event(wp, regs);
+ }
+ update_bp_registers(regs, 0, 1);
+}
+NOKPROBE_SYMBOL(watchpoint_handler);
+
+static int __init arch_hw_breakpoint_init(void)
+{
+ int cpu;
+
+ boot_cpu_data.watch_ireg_count = get_num_brps();
+ boot_cpu_data.watch_dreg_count = get_num_wrps();
+
+ pr_info("Found %d breakpoint and %d watchpoint registers.\n",
+ boot_cpu_data.watch_ireg_count, boot_cpu_data.watch_dreg_count);
+
+ for (cpu = 1; cpu < NR_CPUS; cpu++) {
+ cpu_data[cpu].watch_ireg_count = boot_cpu_data.watch_ireg_count;
+ cpu_data[cpu].watch_dreg_count = boot_cpu_data.watch_dreg_count;
+ }
+
+ return 0;
+}
+arch_initcall(arch_hw_breakpoint_init);
+
+void hw_breakpoint_thread_switch(struct task_struct *next)
+{
+ u64 addr, mask;
+ struct pt_regs *regs = task_pt_regs(next);
+
+ if (test_tsk_thread_flag(next, TIF_SINGLESTEP)) {
+ addr = read_wb_reg(CSR_CFG_ADDR, 0, 0);
+ mask = read_wb_reg(CSR_CFG_MASK, 0, 0);
+ if (!((regs->csr_era ^ addr) & ~mask))
+ csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+ regs->csr_prmd |= CSR_PRMD_PWE;
+ } else {
+ /* Update breakpoints */
+ update_bp_registers(regs, 1, 0);
+ /* Update watchpoints */
+ update_bp_registers(regs, 1, 1);
+ }
+}
+
+void hw_breakpoint_pmu_read(struct perf_event *bp)
+{
+}
+
+/*
+ * Dummy function to register with die_notifier.
+ */
+int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
+ unsigned long val, void *data)
+{
+ return NOTIFY_DONE;
+}
diff --git a/arch/loongarch/kernel/inst.c b/arch/loongarch/kernel/inst.c
index badc59087042..258ef267cd30 100644
--- a/arch/loongarch/kernel/inst.c
+++ b/arch/loongarch/kernel/inst.c
@@ -10,6 +10,129 @@
static DEFINE_RAW_SPINLOCK(patch_lock);
+void simu_pc(struct pt_regs *regs, union loongarch_instruction insn)
+{
+ unsigned long pc = regs->csr_era;
+ unsigned int rd = insn.reg1i20_format.rd;
+ unsigned int imm = insn.reg1i20_format.immediate;
+
+ if (pc & 3) {
+ pr_warn("%s: invalid pc 0x%lx\n", __func__, pc);
+ return;
+ }
+
+ switch (insn.reg1i20_format.opcode) {
+ case pcaddi_op:
+ regs->regs[rd] = pc + sign_extend64(imm << 2, 21);
+ break;
+ case pcaddu12i_op:
+ regs->regs[rd] = pc + sign_extend64(imm << 12, 31);
+ break;
+ case pcaddu18i_op:
+ regs->regs[rd] = pc + sign_extend64(imm << 18, 37);
+ break;
+ case pcalau12i_op:
+ regs->regs[rd] = pc + sign_extend64(imm << 12, 31);
+ regs->regs[rd] &= ~((1 << 12) - 1);
+ break;
+ default:
+ pr_info("%s: unknown opcode\n", __func__);
+ return;
+ }
+
+ regs->csr_era += LOONGARCH_INSN_SIZE;
+}
+
+void simu_branch(struct pt_regs *regs, union loongarch_instruction insn)
+{
+ unsigned int imm, imm_l, imm_h, rd, rj;
+ unsigned long pc = regs->csr_era;
+
+ if (pc & 3) {
+ pr_warn("%s: invalid pc 0x%lx\n", __func__, pc);
+ return;
+ }
+
+ imm_l = insn.reg0i26_format.immediate_l;
+ imm_h = insn.reg0i26_format.immediate_h;
+ switch (insn.reg0i26_format.opcode) {
+ case b_op:
+ regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 27);
+ return;
+ case bl_op:
+ regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 27);
+ regs->regs[1] = pc + LOONGARCH_INSN_SIZE;
+ return;
+ }
+
+ imm_l = insn.reg1i21_format.immediate_l;
+ imm_h = insn.reg1i21_format.immediate_h;
+ rj = insn.reg1i21_format.rj;
+ switch (insn.reg1i21_format.opcode) {
+ case beqz_op:
+ if (regs->regs[rj] == 0)
+ regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 22);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ return;
+ case bnez_op:
+ if (regs->regs[rj] != 0)
+ regs->csr_era = pc + sign_extend64((imm_h << 16 | imm_l) << 2, 22);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ return;
+ }
+
+ imm = insn.reg2i16_format.immediate;
+ rj = insn.reg2i16_format.rj;
+ rd = insn.reg2i16_format.rd;
+ switch (insn.reg2i16_format.opcode) {
+ case beq_op:
+ if (regs->regs[rj] == regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case bne_op:
+ if (regs->regs[rj] != regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case blt_op:
+ if ((long)regs->regs[rj] < (long)regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case bge_op:
+ if ((long)regs->regs[rj] >= (long)regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case bltu_op:
+ if (regs->regs[rj] < regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case bgeu_op:
+ if (regs->regs[rj] >= regs->regs[rd])
+ regs->csr_era = pc + sign_extend64(imm << 2, 17);
+ else
+ regs->csr_era = pc + LOONGARCH_INSN_SIZE;
+ break;
+ case jirl_op:
+ regs->csr_era = regs->regs[rj] + sign_extend64(imm << 2, 17);
+ regs->regs[rd] = pc + LOONGARCH_INSN_SIZE;
+ break;
+ default:
+ pr_info("%s: unknown opcode\n", __func__);
+ return;
+ }
+}
+
int larch_insn_read(void *addr, u32 *insnp)
{
int ret;
diff --git a/arch/loongarch/kernel/kprobes.c b/arch/loongarch/kernel/kprobes.c
new file mode 100644
index 000000000000..56c8c4b09a42
--- /dev/null
+++ b/arch/loongarch/kernel/kprobes.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kdebug.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <asm/break.h>
+
+static const union loongarch_instruction breakpoint_insn = {
+ .reg0i15_format = {
+ .opcode = break_op,
+ .immediate = BRK_KPROBE_BP,
+ }
+};
+
+static const union loongarch_instruction singlestep_insn = {
+ .reg0i15_format = {
+ .opcode = break_op,
+ .immediate = BRK_KPROBE_SSTEPBP,
+ }
+};
+
+DEFINE_PER_CPU(struct kprobe *, current_kprobe);
+DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
+static bool insns_not_supported(union loongarch_instruction insn)
+{
+ switch (insn.reg2i14_format.opcode) {
+ case llw_op:
+ case lld_op:
+ case scw_op:
+ case scd_op:
+ pr_notice("kprobe: ll and sc instructions are not supported\n");
+ return true;
+ }
+
+ switch (insn.reg1i21_format.opcode) {
+ case bceqz_op:
+ pr_notice("kprobe: bceqz and bcnez instructions are not supported\n");
+ return true;
+ }
+
+ return false;
+}
+NOKPROBE_SYMBOL(insns_not_supported);
+
+static bool insns_need_simulation(struct kprobe *p)
+{
+ if (is_pc_ins(&p->opcode))
+ return true;
+
+ if (is_branch_ins(&p->opcode))
+ return true;
+
+ return false;
+}
+NOKPROBE_SYMBOL(insns_need_simulation);
+
+static void arch_simulate_insn(struct kprobe *p, struct pt_regs *regs)
+{
+ if (is_pc_ins(&p->opcode))
+ simu_pc(regs, p->opcode);
+ else if (is_branch_ins(&p->opcode))
+ simu_branch(regs, p->opcode);
+}
+NOKPROBE_SYMBOL(arch_simulate_insn);
+
+static void arch_prepare_ss_slot(struct kprobe *p)
+{
+ p->ainsn.insn[0] = *p->addr;
+ p->ainsn.insn[1] = singlestep_insn;
+ p->ainsn.restore = (unsigned long)p->addr + LOONGARCH_INSN_SIZE;
+}
+NOKPROBE_SYMBOL(arch_prepare_ss_slot);
+
+static void arch_prepare_simulate(struct kprobe *p)
+{
+ p->ainsn.restore = 0;
+}
+NOKPROBE_SYMBOL(arch_prepare_simulate);
+
+int arch_prepare_kprobe(struct kprobe *p)
+{
+ if ((unsigned long)p->addr & 0x3)
+ return -EILSEQ;
+
+ /* copy instruction */
+ p->opcode = *p->addr;
+
+ /* decode instruction */
+ if (insns_not_supported(p->opcode))
+ return -EINVAL;
+
+ if (insns_need_simulation(p)) {
+ p->ainsn.insn = NULL;
+ } else {
+ p->ainsn.insn = get_insn_slot();
+ if (!p->ainsn.insn)
+ return -ENOMEM;
+ }
+
+ /* prepare the instruction */
+ if (p->ainsn.insn)
+ arch_prepare_ss_slot(p);
+ else
+ arch_prepare_simulate(p);
+
+ return 0;
+}
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
+
+/* Install breakpoint in text */
+void arch_arm_kprobe(struct kprobe *p)
+{
+ *p->addr = breakpoint_insn;
+ flush_insn_slot(p);
+}
+NOKPROBE_SYMBOL(arch_arm_kprobe);
+
+/* Remove breakpoint from text */
+void arch_disarm_kprobe(struct kprobe *p)
+{
+ *p->addr = p->opcode;
+ flush_insn_slot(p);
+}
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+ if (p->ainsn.insn) {
+ free_insn_slot(p->ainsn.insn, 0);
+ p->ainsn.insn = NULL;
+ }
+}
+NOKPROBE_SYMBOL(arch_remove_kprobe);
+
+static void save_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ kcb->prev_kprobe.kp = kprobe_running();
+ kcb->prev_kprobe.status = kcb->kprobe_status;
+}
+NOKPROBE_SYMBOL(save_previous_kprobe);
+
+static void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+{
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
+ kcb->kprobe_status = kcb->prev_kprobe.status;
+}
+NOKPROBE_SYMBOL(restore_previous_kprobe);
+
+static void set_current_kprobe(struct kprobe *p)
+{
+ __this_cpu_write(current_kprobe, p);
+}
+NOKPROBE_SYMBOL(set_current_kprobe);
+
+/*
+ * Interrupts need to be disabled before single-step mode is set,
+ * and not reenabled until after single-step mode ends.
+ * Without disabling interrupt on local CPU, there is a chance of
+ * interrupt occurrence in the period of exception return and start
+ * of out-of-line single-step, that result in wrongly single stepping
+ * into the interrupt handler.
+ */
+static void save_local_irqflag(struct kprobe_ctlblk *kcb,
+ struct pt_regs *regs)
+{
+ kcb->saved_status = regs->csr_prmd;
+ regs->csr_prmd &= ~CSR_PRMD_PIE;
+}
+NOKPROBE_SYMBOL(save_local_irqflag);
+
+static void restore_local_irqflag(struct kprobe_ctlblk *kcb,
+ struct pt_regs *regs)
+{
+ regs->csr_prmd = kcb->saved_status;
+}
+NOKPROBE_SYMBOL(restore_local_irqflag);
+
+static void post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb,
+ struct pt_regs *regs)
+{
+ /* return addr restore if non-branching insn */
+ if (cur->ainsn.restore != 0)
+ instruction_pointer_set(regs, cur->ainsn.restore);
+
+ /* restore back original saved kprobe variables and continue */
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
+ preempt_enable_no_resched();
+ return;
+ }
+
+ /*
+ * update the kcb status even if the cur->post_handler is
+ * not set because reset_curent_kprobe() doesn't update kcb.
+ */
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ if (cur->post_handler)
+ cur->post_handler(cur, regs, 0);
+
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(post_kprobe_handler);
+
+static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, int reenter)
+{
+ if (reenter) {
+ save_previous_kprobe(kcb);
+ set_current_kprobe(p);
+ kcb->kprobe_status = KPROBE_REENTER;
+ } else {
+ kcb->kprobe_status = KPROBE_HIT_SS;
+ }
+
+ if (p->ainsn.insn) {
+ /* IRQs and single stepping do not mix well */
+ save_local_irqflag(kcb, regs);
+ /* set ip register to prepare for single stepping */
+ regs->csr_era = (unsigned long)p->ainsn.insn;
+ } else {
+ /* simulate single steping */
+ arch_simulate_insn(p, regs);
+ /* now go for post processing */
+ post_kprobe_handler(p, kcb, regs);
+ }
+}
+NOKPROBE_SYMBOL(setup_singlestep);
+
+static bool reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SS:
+ case KPROBE_HIT_SSDONE:
+ case KPROBE_HIT_ACTIVE:
+ kprobes_inc_nmissed_count(p);
+ setup_singlestep(p, regs, kcb, 1);
+ break;
+ case KPROBE_REENTER:
+ pr_warn("Failed to recover from reentered kprobes.\n");
+ dump_kprobe(p);
+ WARN_ON_ONCE(1);
+ break;
+ default:
+ WARN_ON(1);
+ return false;
+ }
+
+ return true;
+}
+NOKPROBE_SYMBOL(reenter_kprobe);
+
+bool kprobe_breakpoint_handler(struct pt_regs *regs)
+{
+ struct kprobe_ctlblk *kcb;
+ struct kprobe *p, *cur_kprobe;
+ kprobe_opcode_t *addr = (kprobe_opcode_t *)regs->csr_era;
+
+ /*
+ * We don't want to be preempted for the entire
+ * duration of kprobe processing.
+ */
+ preempt_disable();
+ kcb = get_kprobe_ctlblk();
+ cur_kprobe = kprobe_running();
+
+ p = get_kprobe(addr);
+ if (p) {
+ if (cur_kprobe) {
+ if (reenter_kprobe(p, regs, kcb))
+ return true;
+ } else {
+ /* Probe hit */
+ set_current_kprobe(p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ /*
+ * If we have no pre-handler or it returned 0, we
+ * continue with normal processing. If we have a
+ * pre-handler and it returned non-zero, it will
+ * modify the execution path and no need to single
+ * stepping. Let's just reset current kprobe and exit.
+ *
+ * pre_handler can hit a breakpoint and can step thru
+ * before return.
+ */
+ if (!p->pre_handler || !p->pre_handler(p, regs)) {
+ setup_singlestep(p, regs, kcb, 0);
+ } else {
+ reset_current_kprobe();
+ preempt_enable_no_resched();
+ }
+ return true;
+ }
+ }
+
+ if (addr->word != breakpoint_insn.word) {
+ /*
+ * The breakpoint instruction was removed right
+ * after we hit it. Another cpu has removed
+ * either a probepoint or a debugger breakpoint
+ * at this address. In either case, no further
+ * handling of this interrupt is appropriate.
+ * Return back to original instruction, and continue.
+ */
+ regs->csr_era = (unsigned long)addr;
+ preempt_enable_no_resched();
+ return true;
+ }
+
+ preempt_enable_no_resched();
+ return false;
+}
+NOKPROBE_SYMBOL(kprobe_breakpoint_handler);
+
+bool kprobe_singlestep_handler(struct pt_regs *regs)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ unsigned long addr = instruction_pointer(regs);
+
+ if (cur && (kcb->kprobe_status & (KPROBE_HIT_SS | KPROBE_REENTER)) &&
+ ((unsigned long)&cur->ainsn.insn[1] == addr)) {
+ restore_local_irqflag(kcb, regs);
+ post_kprobe_handler(cur, kcb, regs);
+ return true;
+ }
+
+ preempt_enable_no_resched();
+ return false;
+}
+NOKPROBE_SYMBOL(kprobe_singlestep_handler);
+
+bool kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+{
+ struct kprobe *cur = kprobe_running();
+ struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+ switch (kcb->kprobe_status) {
+ case KPROBE_HIT_SS:
+ case KPROBE_REENTER:
+ /*
+ * We are here because the instruction being single
+ * stepped caused a page fault. We reset the current
+ * kprobe and the ip points back to the probe address
+ * and allow the page fault handler to continue as a
+ * normal page fault.
+ */
+ regs->csr_era = (unsigned long)cur->addr;
+ WARN_ON_ONCE(!instruction_pointer(regs));
+
+ if (kcb->kprobe_status == KPROBE_REENTER) {
+ restore_previous_kprobe(kcb);
+ } else {
+ restore_local_irqflag(kcb, regs);
+ reset_current_kprobe();
+ }
+ preempt_enable_no_resched();
+ break;
+ }
+ return false;
+}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
+
+/*
+ * Provide a blacklist of symbols identifying ranges which cannot be kprobed.
+ * This blacklist is exposed to userspace via debugfs (kprobes/blacklist).
+ */
+int __init arch_populate_kprobe_blacklist(void)
+{
+ return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
+ (unsigned long)__irqentry_text_end);
+}
+
+int __init arch_init_kprobes(void)
+{
+ return 0;
+}
+
+/* ASM function that handles the kretprobes must not be probed */
+NOKPROBE_SYMBOL(__kretprobe_trampoline);
+
+/* Called from __kretprobe_trampoline */
+void __used *trampoline_probe_handler(struct pt_regs *regs)
+{
+ return (void *)kretprobe_trampoline_handler(regs, NULL);
+}
+NOKPROBE_SYMBOL(trampoline_probe_handler);
+
+void arch_prepare_kretprobe(struct kretprobe_instance *ri,
+ struct pt_regs *regs)
+{
+ ri->ret_addr = (kprobe_opcode_t *)regs->regs[1];
+ ri->fp = NULL;
+
+ /* Replace the return addr with trampoline addr */
+ regs->regs[1] = (unsigned long)&__kretprobe_trampoline;
+}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+
+int arch_trampoline_kprobe(struct kprobe *p)
+{
+ return 0;
+}
+NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/loongarch/kernel/kprobes_trampoline.S b/arch/loongarch/kernel/kprobes_trampoline.S
new file mode 100644
index 000000000000..af94b0d213fa
--- /dev/null
+++ b/arch/loongarch/kernel/kprobes_trampoline.S
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#include <linux/linkage.h>
+#include <asm/stackframe.h>
+
+ .text
+
+ .macro save_all_base_regs
+ cfi_st ra, PT_R1
+ cfi_st tp, PT_R2
+ cfi_st a0, PT_R4
+ cfi_st a1, PT_R5
+ cfi_st a2, PT_R6
+ cfi_st a3, PT_R7
+ cfi_st a4, PT_R8
+ cfi_st a5, PT_R9
+ cfi_st a6, PT_R10
+ cfi_st a7, PT_R11
+ cfi_st t0, PT_R12
+ cfi_st t1, PT_R13
+ cfi_st t2, PT_R14
+ cfi_st t3, PT_R15
+ cfi_st t4, PT_R16
+ cfi_st t5, PT_R17
+ cfi_st t6, PT_R18
+ cfi_st t7, PT_R19
+ cfi_st t8, PT_R20
+ cfi_st u0, PT_R21
+ cfi_st fp, PT_R22
+ cfi_st s0, PT_R23
+ cfi_st s1, PT_R24
+ cfi_st s2, PT_R25
+ cfi_st s3, PT_R26
+ cfi_st s4, PT_R27
+ cfi_st s5, PT_R28
+ cfi_st s6, PT_R29
+ cfi_st s7, PT_R30
+ cfi_st s8, PT_R31
+ csrrd t0, LOONGARCH_CSR_CRMD
+ andi t0, t0, 0x7 /* extract bit[1:0] PLV, bit[2] IE */
+ LONG_S t0, sp, PT_CRMD
+ .endm
+
+ .macro restore_all_base_regs
+ cfi_ld tp, PT_R2
+ cfi_ld a0, PT_R4
+ cfi_ld a1, PT_R5
+ cfi_ld a2, PT_R6
+ cfi_ld a3, PT_R7
+ cfi_ld a4, PT_R8
+ cfi_ld a5, PT_R9
+ cfi_ld a6, PT_R10
+ cfi_ld a7, PT_R11
+ cfi_ld t0, PT_R12
+ cfi_ld t1, PT_R13
+ cfi_ld t2, PT_R14
+ cfi_ld t3, PT_R15
+ cfi_ld t4, PT_R16
+ cfi_ld t5, PT_R17
+ cfi_ld t6, PT_R18
+ cfi_ld t7, PT_R19
+ cfi_ld t8, PT_R20
+ cfi_ld u0, PT_R21
+ cfi_ld fp, PT_R22
+ cfi_ld s0, PT_R23
+ cfi_ld s1, PT_R24
+ cfi_ld s2, PT_R25
+ cfi_ld s3, PT_R26
+ cfi_ld s4, PT_R27
+ cfi_ld s5, PT_R28
+ cfi_ld s6, PT_R29
+ cfi_ld s7, PT_R30
+ cfi_ld s8, PT_R31
+ LONG_L t0, sp, PT_CRMD
+ li.d t1, 0x7 /* mask bit[1:0] PLV, bit[2] IE */
+ csrxchg t0, t1, LOONGARCH_CSR_CRMD
+ .endm
+
+SYM_CODE_START(__kretprobe_trampoline)
+ addi.d sp, sp, -PT_SIZE
+ save_all_base_regs
+
+ addi.d t0, sp, PT_SIZE
+ LONG_S t0, sp, PT_R3
+
+ move a0, sp /* pt_regs */
+
+ bl trampoline_probe_handler
+
+ /* use the result as the return-address */
+ move ra, a0
+
+ restore_all_base_regs
+ addi.d sp, sp, PT_SIZE
+
+ jr ra
+SYM_CODE_END(__kretprobe_trampoline)
diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c
index edfd220a3737..fa2443c7afb2 100644
--- a/arch/loongarch/kernel/process.c
+++ b/arch/loongarch/kernel/process.c
@@ -18,6 +18,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/hw_breakpoint.h>
#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/unistd.h>
@@ -96,6 +97,11 @@ void start_thread(struct pt_regs *regs, unsigned long pc, unsigned long sp)
regs->regs[3] = sp;
}
+void flush_thread(void)
+{
+ flush_ptrace_hw_breakpoint(current);
+}
+
void exit_thread(struct task_struct *tsk)
{
}
@@ -181,6 +187,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
childregs->regs[2] = tls;
out:
+ ptrace_hw_copy_thread(p);
clear_tsk_thread_flag(p, TIF_USEDFPU);
clear_tsk_thread_flag(p, TIF_USEDSIMD);
clear_tsk_thread_flag(p, TIF_LSX_CTX_LIVE);
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index dc2b82ea894c..06bceae7d104 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -20,7 +20,9 @@
#include <linux/context_tracking.h>
#include <linux/elf.h>
#include <linux/errno.h>
+#include <linux/hw_breakpoint.h>
#include <linux/mm.h>
+#include <linux/nospec.h>
#include <linux/ptrace.h>
#include <linux/regset.h>
#include <linux/sched.h>
@@ -29,6 +31,7 @@
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/seccomp.h>
+#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <asm/byteorder.h>
@@ -39,6 +42,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
+#include <asm/ptrace.h>
#include <asm/reg.h>
#include <asm/syscall.h>
@@ -246,6 +250,384 @@ static int cfg_set(struct task_struct *target,
return 0;
}
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+
+/*
+ * Handle hitting a HW-breakpoint.
+ */
+static void ptrace_hbptriggered(struct perf_event *bp,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int i;
+ struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp);
+
+ for (i = 0; i < LOONGARCH_MAX_BRP; ++i)
+ if (current->thread.hbp_break[i] == bp)
+ break;
+
+ for (i = 0; i < LOONGARCH_MAX_WRP; ++i)
+ if (current->thread.hbp_watch[i] == bp)
+ break;
+
+ force_sig_ptrace_errno_trap(i, (void __user *)bkpt->address);
+}
+
+static struct perf_event *ptrace_hbp_get_event(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx)
+{
+ struct perf_event *bp;
+
+ switch (note_type) {
+ case NT_LOONGARCH_HW_BREAK:
+ if (idx >= LOONGARCH_MAX_BRP)
+ return ERR_PTR(-EINVAL);
+ idx = array_index_nospec(idx, LOONGARCH_MAX_BRP);
+ bp = tsk->thread.hbp_break[idx];
+ break;
+ case NT_LOONGARCH_HW_WATCH:
+ if (idx >= LOONGARCH_MAX_WRP)
+ return ERR_PTR(-EINVAL);
+ idx = array_index_nospec(idx, LOONGARCH_MAX_WRP);
+ bp = tsk->thread.hbp_watch[idx];
+ break;
+ }
+
+ return bp;
+}
+
+static int ptrace_hbp_set_event(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx,
+ struct perf_event *bp)
+{
+ switch (note_type) {
+ case NT_LOONGARCH_HW_BREAK:
+ if (idx >= LOONGARCH_MAX_BRP)
+ return -EINVAL;
+ idx = array_index_nospec(idx, LOONGARCH_MAX_BRP);
+ tsk->thread.hbp_break[idx] = bp;
+ break;
+ case NT_LOONGARCH_HW_WATCH:
+ if (idx >= LOONGARCH_MAX_WRP)
+ return -EINVAL;
+ idx = array_index_nospec(idx, LOONGARCH_MAX_WRP);
+ tsk->thread.hbp_watch[idx] = bp;
+ break;
+ }
+
+ return 0;
+}
+
+static struct perf_event *ptrace_hbp_create(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx)
+{
+ int err, type;
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+
+ switch (note_type) {
+ case NT_LOONGARCH_HW_BREAK:
+ type = HW_BREAKPOINT_X;
+ break;
+ case NT_LOONGARCH_HW_WATCH:
+ type = HW_BREAKPOINT_RW;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ ptrace_breakpoint_init(&attr);
+
+ /*
+ * Initialise fields to sane defaults
+ * (i.e. values that will pass validation).
+ */
+ attr.bp_addr = 0;
+ attr.bp_len = HW_BREAKPOINT_LEN_4;
+ attr.bp_type = type;
+ attr.disabled = 1;
+
+ bp = register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, tsk);
+ if (IS_ERR(bp))
+ return bp;
+
+ err = ptrace_hbp_set_event(note_type, tsk, idx, bp);
+ if (err)
+ return ERR_PTR(err);
+
+ return bp;
+}
+
+static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
+ struct arch_hw_breakpoint_ctrl ctrl,
+ struct perf_event_attr *attr)
+{
+ int err, len, type, offset;
+
+ err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
+ if (err)
+ return err;
+
+ switch (note_type) {
+ case NT_LOONGARCH_HW_BREAK:
+ if ((type & HW_BREAKPOINT_X) != type)
+ return -EINVAL;
+ break;
+ case NT_LOONGARCH_HW_WATCH:
+ if ((type & HW_BREAKPOINT_RW) != type)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ attr->bp_len = len;
+ attr->bp_type = type;
+ attr->bp_addr += offset;
+
+ return 0;
+}
+
+static int ptrace_hbp_get_resource_info(unsigned int note_type, u16 *info)
+{
+ u8 num;
+ u16 reg = 0;
+
+ switch (note_type) {
+ case NT_LOONGARCH_HW_BREAK:
+ num = hw_breakpoint_slots(TYPE_INST);
+ break;
+ case NT_LOONGARCH_HW_WATCH:
+ num = hw_breakpoint_slots(TYPE_DATA);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ *info = reg | num;
+
+ return 0;
+}
+
+static struct perf_event *ptrace_hbp_get_initialised_bp(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx)
+{
+ struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);
+
+ if (!bp)
+ bp = ptrace_hbp_create(note_type, tsk, idx);
+
+ return bp;
+}
+
+static int ptrace_hbp_get_ctrl(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u32 *ctrl)
+{
+ struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);
+
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ *ctrl = bp ? encode_ctrl_reg(counter_arch_bp(bp)->ctrl) : 0;
+
+ return 0;
+}
+
+static int ptrace_hbp_get_mask(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u64 *mask)
+{
+ struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);
+
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ *mask = bp ? counter_arch_bp(bp)->mask : 0;
+
+ return 0;
+}
+
+static int ptrace_hbp_get_addr(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u64 *addr)
+{
+ struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);
+
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ *addr = bp ? counter_arch_bp(bp)->address : 0;
+
+ return 0;
+}
+
+static int ptrace_hbp_set_ctrl(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u32 uctrl)
+{
+ int err;
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+ struct arch_hw_breakpoint_ctrl ctrl;
+
+ bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx);
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ attr = bp->attr;
+ decode_ctrl_reg(uctrl, &ctrl);
+ err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr);
+ if (err)
+ return err;
+
+ return modify_user_hw_breakpoint(bp, &attr);
+}
+
+static int ptrace_hbp_set_mask(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u64 mask)
+{
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+ struct arch_hw_breakpoint *info;
+
+ bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx);
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ attr = bp->attr;
+ info = counter_arch_bp(bp);
+ info->mask = mask;
+
+ return modify_user_hw_breakpoint(bp, &attr);
+}
+
+static int ptrace_hbp_set_addr(unsigned int note_type,
+ struct task_struct *tsk,
+ unsigned long idx, u64 addr)
+{
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+
+ bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx);
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ attr = bp->attr;
+ attr.bp_addr = addr;
+
+ return modify_user_hw_breakpoint(bp, &attr);
+}
+
+#define PTRACE_HBP_CTRL_SZ sizeof(u32)
+#define PTRACE_HBP_ADDR_SZ sizeof(u64)
+#define PTRACE_HBP_MASK_SZ sizeof(u64)
+
+static int hw_break_get(struct task_struct *target,
+ const struct user_regset *regset,
+ struct membuf to)
+{
+ u16 info;
+ u32 ctrl;
+ u64 addr, mask;
+ int ret, idx = 0;
+ unsigned int note_type = regset->core_note_type;
+
+ /* Resource info */
+ ret = ptrace_hbp_get_resource_info(note_type, &info);
+ if (ret)
+ return ret;
+
+ membuf_write(&to, &info, sizeof(info));
+
+ /* (address, ctrl) registers */
+ while (to.left) {
+ ret = ptrace_hbp_get_addr(note_type, target, idx, &addr);
+ if (ret)
+ return ret;
+
+ ret = ptrace_hbp_get_mask(note_type, target, idx, &mask);
+ if (ret)
+ return ret;
+
+ ret = ptrace_hbp_get_ctrl(note_type, target, idx, &ctrl);
+ if (ret)
+ return ret;
+
+ membuf_store(&to, addr);
+ membuf_store(&to, mask);
+ membuf_store(&to, ctrl);
+ idx++;
+ }
+
+ return 0;
+}
+
+static int hw_break_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+ u32 ctrl;
+ u64 addr, mask;
+ int ret, idx = 0, offset, limit;
+ unsigned int note_type = regset->core_note_type;
+
+ /* Resource info */
+ offset = offsetof(struct user_watch_state, dbg_regs);
+ user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset);
+
+ /* (address, ctrl) registers */
+ limit = regset->n * regset->size;
+ while (count && offset < limit) {
+ if (count < PTRACE_HBP_ADDR_SZ)
+ return -EINVAL;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &addr,
+ offset, offset + PTRACE_HBP_ADDR_SZ);
+ if (ret)
+ return ret;
+
+ ret = ptrace_hbp_set_addr(note_type, target, idx, addr);
+ if (ret)
+ return ret;
+ offset += PTRACE_HBP_ADDR_SZ;
+
+ if (!count)
+ break;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask,
+ offset, offset + PTRACE_HBP_ADDR_SZ);
+ if (ret)
+ return ret;
+
+ ret = ptrace_hbp_set_mask(note_type, target, idx, mask);
+ if (ret)
+ return ret;
+ offset += PTRACE_HBP_MASK_SZ;
+
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &mask,
+ offset, offset + PTRACE_HBP_MASK_SZ);
+ if (ret)
+ return ret;
+
+ ret = ptrace_hbp_set_ctrl(note_type, target, idx, ctrl);
+ if (ret)
+ return ret;
+ offset += PTRACE_HBP_CTRL_SZ;
+ idx++;
+ }
+
+ return 0;
+}
+
+#endif
+
struct pt_regs_offset {
const char *name;
int offset;
@@ -319,6 +701,10 @@ enum loongarch_regset {
REGSET_GPR,
REGSET_FPR,
REGSET_CPUCFG,
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ REGSET_HW_BREAK,
+ REGSET_HW_WATCH,
+#endif
};
static const struct user_regset loongarch64_regsets[] = {
@@ -346,6 +732,24 @@ static const struct user_regset loongarch64_regsets[] = {
.regset_get = cfg_get,
.set = cfg_set,
},
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+ [REGSET_HW_BREAK] = {
+ .core_note_type = NT_LOONGARCH_HW_BREAK,
+ .n = sizeof(struct user_watch_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = hw_break_get,
+ .set = hw_break_set,
+ },
+ [REGSET_HW_WATCH] = {
+ .core_note_type = NT_LOONGARCH_HW_WATCH,
+ .n = sizeof(struct user_watch_state) / sizeof(u32),
+ .size = sizeof(u32),
+ .align = sizeof(u32),
+ .regset_get = hw_break_get,
+ .set = hw_break_set,
+ },
+#endif
};
static const struct user_regset_view user_loongarch64_view = {
@@ -431,3 +835,71 @@ long arch_ptrace(struct task_struct *child, long request,
return ret;
}
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+static void ptrace_triggered(struct perf_event *bp,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ struct perf_event_attr attr;
+
+ attr = bp->attr;
+ attr.disabled = true;
+ modify_user_hw_breakpoint(bp, &attr);
+}
+
+static int set_single_step(struct task_struct *tsk, unsigned long addr)
+{
+ struct perf_event *bp;
+ struct perf_event_attr attr;
+ struct arch_hw_breakpoint *info;
+ struct thread_struct *thread = &tsk->thread;
+
+ bp = thread->hbp_break[0];
+ if (!bp) {
+ ptrace_breakpoint_init(&attr);
+
+ attr.bp_addr = addr;
+ attr.bp_len = HW_BREAKPOINT_LEN_8;
+ attr.bp_type = HW_BREAKPOINT_X;
+
+ bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+ NULL, tsk);
+ if (IS_ERR(bp))
+ return PTR_ERR(bp);
+
+ thread->hbp_break[0] = bp;
+ } else {
+ int err;
+
+ attr = bp->attr;
+ attr.bp_addr = addr;
+
+ /* Reenable breakpoint */
+ attr.disabled = false;
+ err = modify_user_hw_breakpoint(bp, &attr);
+ if (unlikely(err))
+ return err;
+
+ csr_write64(attr.bp_addr, LOONGARCH_CSR_IB0ADDR);
+ }
+ info = counter_arch_bp(bp);
+ info->mask = TASK_SIZE - 1;
+
+ return 0;
+}
+
+/* ptrace API */
+void user_enable_single_step(struct task_struct *task)
+{
+ struct thread_info *ti = task_thread_info(task);
+
+ set_single_step(task, task_pt_regs(task)->csr_era);
+ task->thread.single_step = task_pt_regs(task)->csr_era;
+ set_ti_thread_flag(ti, TIF_SINGLESTEP);
+}
+
+void user_disable_single_step(struct task_struct *task)
+{
+ clear_tsk_thread_flag(task, TIF_SINGLESTEP);
+}
+#endif
diff --git a/arch/loongarch/kernel/relocate.c b/arch/loongarch/kernel/relocate.c
new file mode 100644
index 000000000000..01f94d1e3edf
--- /dev/null
+++ b/arch/loongarch/kernel/relocate.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for Kernel relocation at boot time
+ *
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/panic_notifier.h>
+#include <linux/start_kernel.h>
+#include <asm/bootinfo.h>
+#include <asm/early_ioremap.h>
+#include <asm/inst.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+
+#define RELOCATED(x) ((void *)((long)x + reloc_offset))
+#define RELOCATED_KASLR(x) ((void *)((long)x + random_offset))
+
+static unsigned long reloc_offset;
+
+static inline void __init relocate_relative(void)
+{
+ Elf64_Rela *rela, *rela_end;
+ rela = (Elf64_Rela *)&__rela_dyn_begin;
+ rela_end = (Elf64_Rela *)&__rela_dyn_end;
+
+ for ( ; rela < rela_end; rela++) {
+ Elf64_Addr addr = rela->r_offset;
+ Elf64_Addr relocated_addr = rela->r_addend;
+
+ if (rela->r_info != R_LARCH_RELATIVE)
+ continue;
+
+ if (relocated_addr >= VMLINUX_LOAD_ADDRESS)
+ relocated_addr = (Elf64_Addr)RELOCATED(relocated_addr);
+
+ *(Elf64_Addr *)RELOCATED(addr) = relocated_addr;
+ }
+}
+
+static inline void __init relocate_absolute(long random_offset)
+{
+ void *begin, *end;
+ struct rela_la_abs *p;
+
+ begin = RELOCATED_KASLR(&__la_abs_begin);
+ end = RELOCATED_KASLR(&__la_abs_end);
+
+ for (p = begin; (void *)p < end; p++) {
+ long v = p->symvalue;
+ uint32_t lu12iw, ori, lu32id, lu52id;
+ union loongarch_instruction *insn = (void *)p - p->offset;
+
+ lu12iw = (v >> 12) & 0xfffff;
+ ori = v & 0xfff;
+ lu32id = (v >> 32) & 0xfffff;
+ lu52id = v >> 52;
+
+ insn[0].reg1i20_format.immediate = lu12iw;
+ insn[1].reg2i12_format.immediate = ori;
+ insn[2].reg1i20_format.immediate = lu32id;
+ insn[3].reg2i12_format.immediate = lu52id;
+ }
+}
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static inline __init unsigned long rotate_xor(unsigned long hash,
+ const void *area, size_t size)
+{
+ size_t i, diff;
+ const typeof(hash) *ptr = PTR_ALIGN(area, sizeof(hash));
+
+ diff = (void *)ptr - area;
+ if (size < diff + sizeof(hash))
+ return hash;
+
+ size = ALIGN_DOWN(size - diff, sizeof(hash));
+
+ for (i = 0; i < size / sizeof(hash); i++) {
+ /* Rotate by odd number of bits and XOR. */
+ hash = (hash << ((sizeof(hash) * 8) - 7)) | (hash >> 7);
+ hash ^= ptr[i];
+ }
+
+ return hash;
+}
+
+static inline __init unsigned long get_random_boot(void)
+{
+ unsigned long hash = 0;
+ unsigned long entropy = random_get_entropy();
+
+ /* Attempt to create a simple but unpredictable starting entropy. */
+ hash = rotate_xor(hash, linux_banner, strlen(linux_banner));
+
+ /* Add in any runtime entropy we can get */
+ hash = rotate_xor(hash, &entropy, sizeof(entropy));
+
+ return hash;
+}
+
+static inline __init bool kaslr_disabled(void)
+{
+ char *str;
+ const char *builtin_cmdline = CONFIG_CMDLINE;
+
+ str = strstr(builtin_cmdline, "nokaslr");
+ if (str == builtin_cmdline || (str > builtin_cmdline && *(str - 1) == ' '))
+ return true;
+
+ str = strstr(boot_command_line, "nokaslr");
+ if (str == boot_command_line || (str > boot_command_line && *(str - 1) == ' '))
+ return true;
+
+ return false;
+}
+
+/* Choose a new address for the kernel */
+static inline void __init *determine_relocation_address(void)
+{
+ unsigned long kernel_length;
+ unsigned long random_offset;
+ void *destination = _text;
+
+ if (kaslr_disabled())
+ return destination;
+
+ kernel_length = (long)_end - (long)_text;
+
+ random_offset = get_random_boot() << 16;
+ random_offset &= (CONFIG_RANDOMIZE_BASE_MAX_OFFSET - 1);
+ if (random_offset < kernel_length)
+ random_offset += ALIGN(kernel_length, 0xffff);
+
+ return RELOCATED_KASLR(destination);
+}
+
+static inline int __init relocation_addr_valid(void *location_new)
+{
+ if ((unsigned long)location_new & 0x00000ffff)
+ return 0; /* Inappropriately aligned new location */
+
+ if ((unsigned long)location_new < (unsigned long)_end)
+ return 0; /* New location overlaps original kernel */
+
+ return 1;
+}
+#endif
+
+static inline void __init update_reloc_offset(unsigned long *addr, long random_offset)
+{
+ unsigned long *new_addr = (unsigned long *)RELOCATED_KASLR(addr);
+
+ *new_addr = (unsigned long)reloc_offset;
+}
+
+void * __init relocate_kernel(void)
+{
+ unsigned long kernel_length;
+ unsigned long random_offset = 0;
+ void *location_new = _text; /* Default to original kernel start */
+ void *kernel_entry = start_kernel; /* Default to original kernel entry point */
+ char *cmdline = early_ioremap(fw_arg1, COMMAND_LINE_SIZE); /* Boot command line is passed in fw_arg1 */
+
+ strscpy(boot_command_line, cmdline, COMMAND_LINE_SIZE);
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ location_new = determine_relocation_address();
+
+ /* Sanity check relocation address */
+ if (relocation_addr_valid(location_new))
+ random_offset = (unsigned long)location_new - (unsigned long)(_text);
+#endif
+ reloc_offset = (unsigned long)_text - VMLINUX_LOAD_ADDRESS;
+
+ if (random_offset) {
+ kernel_length = (long)(_end) - (long)(_text);
+
+ /* Copy the kernel to it's new location */
+ memcpy(location_new, _text, kernel_length);
+
+ /* Sync the caches ready for execution of new kernel */
+ __asm__ __volatile__ (
+ "ibar 0 \t\n"
+ "dbar 0 \t\n"
+ ::: "memory");
+
+ reloc_offset += random_offset;
+
+ /* Return the new kernel's entry point */
+ kernel_entry = RELOCATED_KASLR(start_kernel);
+
+ /* The current thread is now within the relocated kernel */
+ __current_thread_info = RELOCATED_KASLR(__current_thread_info);
+
+ update_reloc_offset(&reloc_offset, random_offset);
+ }
+
+ if (reloc_offset)
+ relocate_relative();
+
+ relocate_absolute(random_offset);
+
+ return kernel_entry;
+}
+
+/*
+ * Show relocation information on panic.
+ */
+static void show_kernel_relocation(const char *level)
+{
+ if (reloc_offset > 0) {
+ printk(level);
+ pr_cont("Kernel relocated by 0x%lx\n", reloc_offset);
+ pr_cont(" .text @ 0x%px\n", _text);
+ pr_cont(" .data @ 0x%px\n", _sdata);
+ pr_cont(" .bss @ 0x%px\n", __bss_start);
+ }
+}
+
+static int kernel_location_notifier_fn(struct notifier_block *self,
+ unsigned long v, void *p)
+{
+ show_kernel_relocation(KERN_EMERG);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kernel_location_notifier = {
+ .notifier_call = kernel_location_notifier_fn
+};
+
+static int __init register_kernel_offset_dumper(void)
+{
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &kernel_location_notifier);
+ return 0;
+}
+
+arch_initcall(register_kernel_offset_dumper);
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index 4344502c0b31..bae84ccf6d36 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -234,11 +234,14 @@ static void __init arch_reserve_vmcore(void)
#endif
}
+/* 2MB alignment for crash kernel regions */
+#define CRASH_ALIGN SZ_2M
+#define CRASH_ADDR_MAX SZ_4G
+
static void __init arch_parse_crashkernel(void)
{
#ifdef CONFIG_KEXEC
int ret;
- unsigned long long start;
unsigned long long total_mem;
unsigned long long crash_base, crash_size;
@@ -247,8 +250,13 @@ static void __init arch_parse_crashkernel(void)
if (ret < 0 || crash_size <= 0)
return;
- start = memblock_phys_alloc_range(crash_size, 1, crash_base, crash_base + crash_size);
- if (start != crash_base) {
+ if (crash_base <= 0) {
+ crash_base = memblock_phys_alloc_range(crash_size, CRASH_ALIGN, CRASH_ALIGN, CRASH_ADDR_MAX);
+ if (!crash_base) {
+ pr_warn("crashkernel reservation failed - No suitable area found.\n");
+ return;
+ }
+ } else if (!memblock_phys_alloc_range(crash_size, CRASH_ALIGN, crash_base, crash_base + crash_size)) {
pr_warn("Invalid memory region reserved for crash kernel\n");
return;
}
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index a6576dea590c..4351f69d9950 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -140,16 +140,17 @@ static int get_timer_irq(void)
int constant_clockevent_init(void)
{
- int irq;
unsigned int cpu = smp_processor_id();
unsigned long min_delta = 0x600;
unsigned long max_delta = (1UL << 48) - 1;
struct clock_event_device *cd;
- static int timer_irq_installed = 0;
+ static int irq = 0, timer_irq_installed = 0;
- irq = get_timer_irq();
- if (irq < 0)
- pr_err("Failed to map irq %d (timer)\n", irq);
+ if (!timer_irq_installed) {
+ irq = get_timer_irq();
+ if (irq < 0)
+ pr_err("Failed to map irq %d (timer)\n", irq);
+ }
cd = &per_cpu(constant_clockevent_device, cpu);
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index c38a146a973b..de8ebe20b666 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -371,9 +371,14 @@ int no_unaligned_warning __read_mostly = 1; /* Only 1 warning by default */
asmlinkage void noinstr do_ale(struct pt_regs *regs)
{
- unsigned int *pc;
irqentry_state_t state = irqentry_enter(regs);
+#ifndef CONFIG_ARCH_STRICT_ALIGN
+ die_if_kernel("Kernel ale access", regs);
+ force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr);
+#else
+ unsigned int *pc;
+
perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->csr_badvaddr);
/*
@@ -397,8 +402,8 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs)
sigbus:
die_if_kernel("Kernel ale access", regs);
force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr);
-
out:
+#endif
irqentry_exit(regs, state);
}
@@ -432,7 +437,9 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs)
unsigned long era = exception_era(regs);
irqentry_state_t state = irqentry_enter(regs);
- local_irq_enable();
+ if (regs->csr_prmd & CSR_PRMD_PIE)
+ local_irq_enable();
+
current->thread.trap_nr = read_csr_excode();
if (__get_inst(&opcode, (u32 *)era, user))
goto out_sigsegv;
@@ -445,14 +452,12 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs)
*/
switch (bcode) {
case BRK_KPROBE_BP:
- if (notify_die(DIE_BREAK, "Kprobe", regs, bcode,
- current->thread.trap_nr, SIGTRAP) == NOTIFY_STOP)
+ if (kprobe_breakpoint_handler(regs))
goto out;
else
break;
case BRK_KPROBE_SSTEPBP:
- if (notify_die(DIE_SSTEPBP, "Kprobe_SingleStep", regs, bcode,
- current->thread.trap_nr, SIGTRAP) == NOTIFY_STOP)
+ if (kprobe_singlestep_handler(regs))
goto out;
else
break;
@@ -495,7 +500,9 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs)
}
out:
- local_irq_disable();
+ if (regs->csr_prmd & CSR_PRMD_PIE)
+ local_irq_disable();
+
irqentry_exit(regs, state);
return;
@@ -506,7 +513,52 @@ out_sigsegv:
asmlinkage void noinstr do_watch(struct pt_regs *regs)
{
+ irqentry_state_t state = irqentry_enter(regs);
+
+#ifndef CONFIG_HAVE_HW_BREAKPOINT
pr_warn("Hardware watch point handler not implemented!\n");
+#else
+ if (test_tsk_thread_flag(current, TIF_SINGLESTEP)) {
+ int llbit = (csr_read32(LOONGARCH_CSR_LLBCTL) & 0x1);
+ unsigned long pc = instruction_pointer(regs);
+ union loongarch_instruction *ip = (union loongarch_instruction *)pc;
+
+ if (llbit) {
+ /*
+ * When the ll-sc combo is encountered, it is regarded as an single
+ * instruction. So don't clear llbit and reset CSR.FWPS.Skip until
+ * the llsc execution is completed.
+ */
+ csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+ csr_write32(CSR_LLBCTL_KLO, LOONGARCH_CSR_LLBCTL);
+ goto out;
+ }
+
+ if (pc == current->thread.single_step) {
+ /*
+ * Certain insns are occasionally not skipped when CSR.FWPS.Skip is
+ * set, such as fld.d/fst.d. So singlestep needs to compare whether
+ * the csr_era is equal to the value of singlestep which last time set.
+ */
+ if (!is_self_loop_ins(ip, regs)) {
+ /*
+ * Check if the given instruction the target pc is equal to the
+ * current pc, If yes, then we should not set the CSR.FWPS.SKIP
+ * bit to break the original instruction stream.
+ */
+ csr_write32(CSR_FWPC_SKIP, LOONGARCH_CSR_FWPS);
+ goto out;
+ }
+ }
+ } else {
+ breakpoint_handler(regs);
+ watchpoint_handler(regs);
+ }
+
+ force_sig(SIGTRAP);
+out:
+#endif
+ irqentry_exit(regs, state);
}
asmlinkage void noinstr do_ri(struct pt_regs *regs)
diff --git a/arch/loongarch/kernel/vmlinux.lds.S b/arch/loongarch/kernel/vmlinux.lds.S
index 78506b31ba61..0c7b041be9d8 100644
--- a/arch/loongarch/kernel/vmlinux.lds.S
+++ b/arch/loongarch/kernel/vmlinux.lds.S
@@ -65,10 +65,21 @@ SECTIONS
__alt_instructions_end = .;
}
+#ifdef CONFIG_RELOCATABLE
+ . = ALIGN(8);
+ .la_abs : AT(ADDR(.la_abs) - LOAD_OFFSET) {
+ __la_abs_begin = .;
+ *(.la_abs)
+ __la_abs_end = .;
+ }
+#endif
+
.got : ALIGN(16) { *(.got) }
.plt : ALIGN(16) { *(.plt) }
.got.plt : ALIGN(16) { *(.got.plt) }
+ .data.rel : { *(.data.rel*) }
+
. = ALIGN(PECOFF_SEGMENT_ALIGN);
__init_begin = .;
__inittext_begin = .;
@@ -92,8 +103,6 @@ SECTIONS
PERCPU_SECTION(1 << CONFIG_L1_CACHE_SHIFT)
#endif
- .rela.dyn : ALIGN(8) { *(.rela.dyn) *(.rela*) }
-
.init.bss : {
*(.init.bss)
}
@@ -106,6 +115,12 @@ SECTIONS
RO_DATA(4096)
RW_DATA(1 << CONFIG_L1_CACHE_SHIFT, PAGE_SIZE, THREAD_SIZE)
+ .rela.dyn : ALIGN(8) {
+ __rela_dyn_begin = .;
+ *(.rela.dyn) *(.rela*)
+ __rela_dyn_end = .;
+ }
+
.sdata : {
*(.sdata)
}
@@ -132,6 +147,7 @@ SECTIONS
DISCARDS
/DISCARD/ : {
+ *(.dynamic .dynsym .dynstr .hash .gnu.hash)
*(.gnu.attributes)
*(.options)
*(.eh_frame)
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
index 7c07d595ee89..3b7e1dec7109 100644
--- a/arch/loongarch/lib/memcpy.S
+++ b/arch/loongarch/lib/memcpy.S
@@ -17,6 +17,7 @@ SYM_FUNC_START(memcpy)
ALTERNATIVE "b __memcpy_generic", \
"b __memcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memcpy)
+_ASM_NOKPROBE(memcpy)
EXPORT_SYMBOL(memcpy)
@@ -41,6 +42,7 @@ SYM_FUNC_START(__memcpy_generic)
2: move a0, a3
jr ra
SYM_FUNC_END(__memcpy_generic)
+_ASM_NOKPROBE(__memcpy_generic)
/*
* void *__memcpy_fast(void *dst, const void *src, size_t n)
@@ -93,3 +95,4 @@ SYM_FUNC_START(__memcpy_fast)
3: move a0, a3
jr ra
SYM_FUNC_END(__memcpy_fast)
+_ASM_NOKPROBE(__memcpy_fast)
diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S
index 6ffdb46da78f..b796c3d6da05 100644
--- a/arch/loongarch/lib/memmove.S
+++ b/arch/loongarch/lib/memmove.S
@@ -29,6 +29,7 @@ SYM_FUNC_START(memmove)
b rmemcpy
4: b __rmemcpy_generic
SYM_FUNC_END(memmove)
+_ASM_NOKPROBE(memmove)
EXPORT_SYMBOL(memmove)
@@ -39,6 +40,7 @@ SYM_FUNC_START(rmemcpy)
ALTERNATIVE "b __rmemcpy_generic", \
"b __rmemcpy_fast", CPU_FEATURE_UAL
SYM_FUNC_END(rmemcpy)
+_ASM_NOKPROBE(rmemcpy)
/*
* void *__rmemcpy_generic(void *dst, const void *src, size_t n)
@@ -64,6 +66,7 @@ SYM_FUNC_START(__rmemcpy_generic)
2: move a0, a3
jr ra
SYM_FUNC_END(__rmemcpy_generic)
+_ASM_NOKPROBE(__rmemcpy_generic)
/*
* void *__rmemcpy_fast(void *dst, const void *src, size_t n)
@@ -119,3 +122,4 @@ SYM_FUNC_START(__rmemcpy_fast)
3: move a0, a3
jr ra
SYM_FUNC_END(__rmemcpy_fast)
+_ASM_NOKPROBE(__rmemcpy_fast)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
index e7cb4ea3747d..a9eb732ab2ad 100644
--- a/arch/loongarch/lib/memset.S
+++ b/arch/loongarch/lib/memset.S
@@ -23,6 +23,7 @@ SYM_FUNC_START(memset)
ALTERNATIVE "b __memset_generic", \
"b __memset_fast", CPU_FEATURE_UAL
SYM_FUNC_END(memset)
+_ASM_NOKPROBE(memset)
EXPORT_SYMBOL(memset)
@@ -45,6 +46,7 @@ SYM_FUNC_START(__memset_generic)
2: move a0, a3
jr ra
SYM_FUNC_END(__memset_generic)
+_ASM_NOKPROBE(__memset_generic)
/*
* void *__memset_fast(void *s, int c, size_t n)
@@ -89,3 +91,4 @@ SYM_FUNC_START(__memset_fast)
3: move a0, a3
jr ra
SYM_FUNC_END(__memset_fast)
+_ASM_NOKPROBE(__memset_fast)
diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index 1ccd53655cab..449087bd589d 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -135,6 +135,9 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
struct vm_area_struct *vma = NULL;
vm_fault_t fault;
+ if (kprobe_page_fault(regs, current->thread.trap_nr))
+ return;
+
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 58781c6e4191..244e2f5aeee5 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -24,8 +24,7 @@
move a0, sp
REG_S a2, sp, PT_BVADDR
li.w a1, \write
- la.abs t0, do_page_fault
- jirl ra, t0, 0
+ bl do_page_fault
RESTORE_ALL_AND_RET
SYM_FUNC_END(tlb_do_page_fault_\write)
.endm
@@ -40,7 +39,7 @@ SYM_FUNC_START(handle_tlb_protect)
move a1, zero
csrrd a2, LOONGARCH_CSR_BADV
REG_S a2, sp, PT_BVADDR
- la.abs t0, do_page_fault
+ la_abs t0, do_page_fault
jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_tlb_protect)
@@ -116,7 +115,7 @@ smp_pgtable_change_load:
#ifdef CONFIG_64BIT
vmalloc_load:
- la.abs t1, swapper_pg_dir
+ la_abs t1, swapper_pg_dir
b vmalloc_done_load
#endif
@@ -187,7 +186,7 @@ tlb_huge_update_load:
nopage_tlb_load:
dbar 0
csrrd ra, EXCEPTION_KS2
- la.abs t0, tlb_do_page_fault_0
+ la_abs t0, tlb_do_page_fault_0
jr t0
SYM_FUNC_END(handle_tlb_load)
@@ -263,7 +262,7 @@ smp_pgtable_change_store:
#ifdef CONFIG_64BIT
vmalloc_store:
- la.abs t1, swapper_pg_dir
+ la_abs t1, swapper_pg_dir
b vmalloc_done_store
#endif
@@ -336,7 +335,7 @@ tlb_huge_update_store:
nopage_tlb_store:
dbar 0
csrrd ra, EXCEPTION_KS2
- la.abs t0, tlb_do_page_fault_1
+ la_abs t0, tlb_do_page_fault_1
jr t0
SYM_FUNC_END(handle_tlb_store)
@@ -411,7 +410,7 @@ smp_pgtable_change_modify:
#ifdef CONFIG_64BIT
vmalloc_modify:
- la.abs t1, swapper_pg_dir
+ la_abs t1, swapper_pg_dir
b vmalloc_done_modify
#endif
@@ -483,7 +482,7 @@ tlb_huge_update_modify:
nopage_tlb_modify:
dbar 0
csrrd ra, EXCEPTION_KS2
- la.abs t0, tlb_do_page_fault_1
+ la_abs t0, tlb_do_page_fault_1
jr t0
SYM_FUNC_END(handle_tlb_modify)
diff --git a/arch/loongarch/power/suspend_asm.S b/arch/loongarch/power/suspend_asm.S
index eb2675642f9f..90da899c06a1 100644
--- a/arch/loongarch/power/suspend_asm.S
+++ b/arch/loongarch/power/suspend_asm.S
@@ -78,9 +78,8 @@ SYM_INNER_LABEL(loongarch_wakeup_start, SYM_L_GLOBAL)
li.d t0, CSR_DMW1_INIT # CA, PLV0
csrwr t0, LOONGARCH_CSR_DMWIN1
- la.abs t0, 0f
- jr t0
-0:
+ JUMP_VIRT_ADDR t0, t1
+
la.pcrel t0, acpi_saved_sp
ld.d sp, t0, 0
SETUP_WAKEUP
diff --git a/arch/m68k/68000/dragen2.c b/arch/m68k/68000/dragen2.c
index 1a57eff28cfe..7f1804e31a06 100644
--- a/arch/m68k/68000/dragen2.c
+++ b/arch/m68k/68000/dragen2.c
@@ -15,7 +15,7 @@
#include "screen.h"
/***************************************************************************/
-/* Init Drangon Engine hardware */
+/* Init Dragon Engine II hardware */
/***************************************************************************/
static void dragen2_reset(void)
diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine
index 53c45ccda564..e2f961208f18 100644
--- a/arch/m68k/Kconfig.machine
+++ b/arch/m68k/Kconfig.machine
@@ -192,18 +192,18 @@ config UCSIMM
Support for the Arcturus Networks uCsimm module.
config UCDIMM
- bool "uDsimm module support"
+ bool "uCdimm module support"
depends on !MMU
select M68VZ328
help
- Support for the Arcturus Networks uDsimm module.
+ Support for the Arcturus Networks uCdimm module.
config DRAGEN2
- bool "DragenEngine II board support"
+ bool "DragonEngine II board support"
depends on !MMU
select M68VZ328
help
- Support for the DragenEngine II board.
+ Support for the DragonEngine II board.
config DIRECT_IO_ACCESS
bool "Allow user to access IO directly"
diff --git a/arch/mips/Kbuild b/arch/mips/Kbuild
index 9e8071f0e58f..af2967bffb73 100644
--- a/arch/mips/Kbuild
+++ b/arch/mips/Kbuild
@@ -7,7 +7,7 @@ subdir-ccflags-y := -Werror
endif
# platform specific definitions
-include arch/mips/Kbuild.platforms
+include $(srctree)/arch/mips/Kbuild.platforms
obj-y := $(platform-y)
# make clean traverses $(obj-) without having included .config, so
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 15cb692b0a09..37072e15b263 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -445,6 +445,7 @@ config LANTIQ
select IRQ_MIPS_CPU
select CEVT_R4K
select CSRC_R4K
+ select NO_EXCEPT_FILL
select SYS_HAS_CPU_MIPS32_R1
select SYS_HAS_CPU_MIPS32_R2
select SYS_SUPPORTS_BIG_ENDIAN
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 490dea07d4e0..04e46ec24319 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -95,7 +95,7 @@ all-$(CONFIG_SYS_SUPPORTS_ZBOOT)+= vmlinuz
# crossformat linking we rely on the elf2ecoff tool for format conversion.
#
cflags-y += -G 0 -mno-abicalls -fno-pic -pipe
-cflags-y += -msoft-float
+cflags-y += -msoft-float -Wa,-msoft-float
LDFLAGS_vmlinux += -G 0 -static -n -nostdlib
KBUILD_AFLAGS_MODULE += -mlong-calls
KBUILD_CFLAGS_MODULE += -mlong-calls
@@ -104,15 +104,6 @@ ifeq ($(CONFIG_RELOCATABLE),y)
LDFLAGS_vmlinux += --emit-relocs
endif
-#
-# pass -msoft-float to GAS if it supports it. However on newer binutils
-# (specifically newer than 2.24.51.20140728) we then also need to explicitly
-# set ".set hardfloat" in all files which manipulate floating point registers.
-#
-ifneq ($(call as-option,-Wa$(comma)-msoft-float,),)
- cflags-y += -DGAS_HAS_SET_HARDFLOAT -Wa,-msoft-float
-endif
-
cflags-y += -ffreestanding
cflags-$(CONFIG_CPU_BIG_ENDIAN) += -EB
@@ -152,7 +143,7 @@ cflags-y += -fno-stack-check
#
# Avoid this by explicitly disabling that assembler behaviour.
#
-cflags-y += $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
+cflags-y += $(call cc-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
#
# CPU-dependent compiler/assembler options for optimization.
diff --git a/arch/mips/Makefile.postlink b/arch/mips/Makefile.postlink
index 4b1d3ba3a8a2..34e3bd71f3b0 100644
--- a/arch/mips/Makefile.postlink
+++ b/arch/mips/Makefile.postlink
@@ -10,7 +10,7 @@ PHONY := __archpost
__archpost:
-include include/config/auto.conf
-include scripts/Kbuild.include
+include $(srctree)/scripts/Kbuild.include
CMD_LS3_LLSC = arch/mips/tools/loongson3-llsc-check
quiet_cmd_ls3_llsc = LLSCCHK $@
diff --git a/arch/mips/bcm47xx/board.c b/arch/mips/bcm47xx/board.c
index 8ef002471b9c..90fb48b046c0 100644
--- a/arch/mips/bcm47xx/board.c
+++ b/arch/mips/bcm47xx/board.c
@@ -130,6 +130,7 @@ struct bcm47xx_board_type_list2 bcm47xx_board_list_boot_hw[] __initconst = {
{{BCM47XX_BOARD_LINKSYS_E1000V21, "Linksys E1000 V2.1"}, "E1000", "2.1"},
{{BCM47XX_BOARD_LINKSYS_E1200V2, "Linksys E1200 V2"}, "E1200", "2.0"},
{{BCM47XX_BOARD_LINKSYS_E2000V1, "Linksys E2000 V1"}, "Linksys E2000", "1.0"},
+ {{BCM47XX_BOARD_LINKSYS_E2500V3, "Linksys E2500 V3"}, "E2500", "1.0"},
/* like WRT610N v2.0 */
{{BCM47XX_BOARD_LINKSYS_E3000V1, "Linksys E3000 V1"}, "E300", "1.0"},
{{BCM47XX_BOARD_LINKSYS_E3200V1, "Linksys E3200 V1"}, "E3200", "1.0"},
diff --git a/arch/mips/bcm47xx/buttons.c b/arch/mips/bcm47xx/buttons.c
index 38e4a9cbcf4e..437a737c01dd 100644
--- a/arch/mips/bcm47xx/buttons.c
+++ b/arch/mips/bcm47xx/buttons.c
@@ -223,6 +223,12 @@ bcm47xx_buttons_linksys_e2000v1[] __initconst = {
};
static const struct gpio_keys_button
+bcm47xx_buttons_linksys_e2500v3[] __initconst = {
+ BCM47XX_GPIO_KEY(9, KEY_WPS_BUTTON),
+ BCM47XX_GPIO_KEY(10, KEY_RESTART),
+};
+
+static const struct gpio_keys_button
bcm47xx_buttons_linksys_e3000v1[] __initconst = {
BCM47XX_GPIO_KEY(4, KEY_WPS_BUTTON),
BCM47XX_GPIO_KEY(6, KEY_RESTART),
@@ -617,6 +623,9 @@ int __init bcm47xx_buttons_register(void)
case BCM47XX_BOARD_LINKSYS_E2000V1:
err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e2000v1);
break;
+ case BCM47XX_BOARD_LINKSYS_E2500V3:
+ err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e2500v3);
+ break;
case BCM47XX_BOARD_LINKSYS_E3000V1:
err = bcm47xx_copy_bdata(bcm47xx_buttons_linksys_e3000v1);
break;
diff --git a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts
index 2fdb4baad19c..cb460eaf8835 100644
--- a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts
+++ b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-1000n.dts
@@ -20,27 +20,27 @@
leds {
compatible = "gpio-leds";
- usb1 {
+ led-usb1 {
label = "usb1";
gpios = <&gpio 9 GPIO_ACTIVE_LOW>;
};
- usb2 {
+ led-usb2 {
label = "usb2";
gpios = <&gpio 10 GPIO_ACTIVE_LOW>;
};
- wps {
+ led-wps {
label = "wps";
gpios = <&gpio 11 GPIO_ACTIVE_LOW>;
};
- wireless1 {
+ led-wireless1 {
label = "5g";
gpios = <&gpio 17 GPIO_ACTIVE_LOW>;
};
- wireless2 {
+ led-wireless2 {
label = "2.4g";
gpios = <&gpio 18 GPIO_ACTIVE_LOW>;
};
diff --git a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts
index e04237281b41..c55845fd84ca 100644
--- a/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts
+++ b/arch/mips/boot/dts/cavium-octeon/dlink_dsr-500n.dts
@@ -21,15 +21,15 @@
leds {
compatible = "gpio-leds";
- usb {
+ led-usb {
gpios = <&gpio 9 GPIO_ACTIVE_LOW>;
};
- wps {
+ led-wps {
gpios = <&gpio 11 GPIO_ACTIVE_LOW>;
};
- wireless {
+ led-wireless {
label = "2.4g";
gpios = <&gpio 18 GPIO_ACTIVE_LOW>;
};
diff --git a/arch/mips/boot/dts/img/boston.dts b/arch/mips/boot/dts/img/boston.dts
index 84328afa3a55..72f7605d2e31 100644
--- a/arch/mips/boot/dts/img/boston.dts
+++ b/arch/mips/boot/dts/img/boston.dts
@@ -125,7 +125,7 @@
#interrupt-cells = <1>;
};
- pci2_root@0,0,0 {
+ pci2_root@0,0 {
compatible = "pci10ee,7021";
reg = <0x00000000 0 0 0 0>;
diff --git a/arch/mips/boot/dts/ingenic/ci20.dts b/arch/mips/boot/dts/ingenic/ci20.dts
index f38c39572a9e..239c4537484d 100644
--- a/arch/mips/boot/dts/ingenic/ci20.dts
+++ b/arch/mips/boot/dts/ingenic/ci20.dts
@@ -42,25 +42,25 @@
leds {
compatible = "gpio-leds";
- led0 {
+ led-0 {
label = "ci20:red:led0";
gpios = <&gpc 3 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "none";
};
- led1 {
+ led-1 {
label = "ci20:red:led1";
gpios = <&gpc 2 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "nand-disk";
};
- led2 {
+ led-2 {
label = "ci20:red:led2";
gpios = <&gpc 1 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "cpu1";
};
- led3 {
+ led-3 {
label = "ci20:red:led3";
gpios = <&gpc 0 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "cpu0";
@@ -113,7 +113,7 @@
regulator-min-microvolt = <5000000>;
regulator-max-microvolt = <5000000>;
- gpio = <&gpf 14 GPIO_ACTIVE_LOW>;
+ gpio = <&gpf 15 GPIO_ACTIVE_LOW>;
enable-active-high;
};
};
diff --git a/arch/mips/boot/dts/ingenic/jz4780.dtsi b/arch/mips/boot/dts/ingenic/jz4780.dtsi
index c182a656d63b..18affff85ce3 100644
--- a/arch/mips/boot/dts/ingenic/jz4780.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4780.dtsi
@@ -155,6 +155,8 @@
clocks = <&cgu JZ4780_CLK_RTCLK>;
clock-names = "rtc";
+
+ #clock-cells = <0>;
};
pinctrl: pin-controller@10010000 {
diff --git a/arch/mips/boot/dts/lantiq/danube.dtsi b/arch/mips/boot/dts/lantiq/danube.dtsi
index 510be63c8bdf..7a7ba66aa534 100644
--- a/arch/mips/boot/dts/lantiq/danube.dtsi
+++ b/arch/mips/boot/dts/lantiq/danube.dtsi
@@ -40,7 +40,6 @@
eiu0: eiu@101000 {
#interrupt-cells = <1>;
interrupt-controller;
- interrupt-parent;
compatible = "lantiq,eiu-xway";
reg = <0x101000 0x1000>;
};
diff --git a/arch/mips/boot/dts/pic32/pic32mzda_sk.dts b/arch/mips/boot/dts/pic32/pic32mzda_sk.dts
index ab70637bbec5..b1c5ffdb33fc 100644
--- a/arch/mips/boot/dts/pic32/pic32mzda_sk.dts
+++ b/arch/mips/boot/dts/pic32/pic32mzda_sk.dts
@@ -28,19 +28,19 @@
pinctrl-names = "default";
pinctrl-0 = <&user_leds_s0>;
- led@1 {
+ led-1 {
label = "pic32mzda_sk:red:led1";
gpios = <&gpio7 0 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "heartbeat";
};
- led@2 {
+ led-2 {
label = "pic32mzda_sk:yellow:led2";
gpios = <&gpio7 1 GPIO_ACTIVE_HIGH>;
linux,default-trigger = "mmc0";
};
- led@3 {
+ led-3 {
label = "pic32mzda_sk:green:led3";
gpios = <&gpio7 2 GPIO_ACTIVE_HIGH>;
default-state = "on";
diff --git a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
index f3dff4009ab5..f894fe17816b 100644
--- a/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
+++ b/arch/mips/boot/dts/qca/ar9132_tl_wr1043nd_v1.dts
@@ -41,23 +41,23 @@
leds {
compatible = "gpio-leds";
- led@0 {
+ led-0 {
label = "tp-link:green:usb";
gpios = <&gpio 1 GPIO_ACTIVE_LOW>;
};
- led@1 {
+ led-1 {
label = "tp-link:green:system";
gpios = <&gpio 2 GPIO_ACTIVE_LOW>;
linux,default-trigger = "heartbeat";
};
- led@2 {
+ led-2 {
label = "tp-link:green:qss";
gpios = <&gpio 5 GPIO_ACTIVE_HIGH>;
};
- led@3 {
+ led-3 {
label = "tp-link:green:wlan";
gpios = <&gpio 9 GPIO_ACTIVE_LOW>;
};
diff --git a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
index 40e4c5da0e65..7affa58d4fa6 100644
--- a/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
+++ b/arch/mips/boot/dts/qca/ar9331_dragino_ms14.dts
@@ -22,25 +22,25 @@
leds {
compatible = "gpio-leds";
- wlan {
+ led-wlan {
label = "dragino2:red:wlan";
gpios = <&gpio 0 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- lan {
+ led-lan {
label = "dragino2:red:lan";
gpios = <&gpio 13 GPIO_ACTIVE_LOW>;
default-state = "off";
};
- wan {
+ led-wan {
label = "dragino2:red:wan";
gpios = <&gpio 17 GPIO_ACTIVE_LOW>;
default-state = "off";
};
- system {
+ led-system {
label = "dragino2:red:system";
gpios = <&gpio 28 GPIO_ACTIVE_HIGH>;
default-state = "off";
diff --git a/arch/mips/boot/dts/qca/ar9331_omega.dts b/arch/mips/boot/dts/qca/ar9331_omega.dts
index ed184d861d5f..8904aa917a6e 100644
--- a/arch/mips/boot/dts/qca/ar9331_omega.dts
+++ b/arch/mips/boot/dts/qca/ar9331_omega.dts
@@ -22,7 +22,7 @@
leds {
compatible = "gpio-leds";
- system {
+ led-system {
label = "onion:amber:system";
gpios = <&gpio 27 GPIO_ACTIVE_LOW>;
default-state = "off";
diff --git a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
index 5f424c2cd781..10b9759228b7 100644
--- a/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
+++ b/arch/mips/boot/dts/qca/ar9331_tl_mr3020.dts
@@ -22,25 +22,25 @@
leds {
compatible = "gpio-leds";
- wlan {
+ led-wlan {
label = "tp-link:green:wlan";
gpios = <&gpio 0 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- lan {
+ led-lan {
label = "tp-link:green:lan";
gpios = <&gpio 17 GPIO_ACTIVE_LOW>;
default-state = "off";
};
- wps {
+ led-wps {
label = "tp-link:green:wps";
gpios = <&gpio 26 GPIO_ACTIVE_LOW>;
default-state = "off";
};
- led3g {
+ led-led3g {
label = "tp-link:green:3g";
gpios = <&gpio 27 GPIO_ACTIVE_LOW>;
default-state = "off";
diff --git a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
index 179558161f85..18107ca0a06b 100644
--- a/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
+++ b/arch/mips/boot/dts/ralink/gardena_smart_gateway_mt7688.dts
@@ -47,67 +47,67 @@
* (see below). So we can't include it in this LED node.
*/
- power_blue {
+ led-power-blue {
label = "smartgw:power:blue";
gpios = <&gpio 18 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- power_green {
+ led-power-green {
label = "smartgw:power:green";
gpios = <&gpio 19 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- power_red {
+ led-power-red {
label = "smartgw:power:red";
gpios = <&gpio 22 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- radio_blue {
+ led-radio-blue {
label = "smartgw:radio:blue";
gpios = <&gpio 23 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- radio_green {
+ led-radio-green {
label = "smartgw:radio:green";
gpios = <&gpio 24 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- radio_red {
+ led-radio-red {
label = "smartgw:radio:red";
gpios = <&gpio 25 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- internet_blue {
+ led-internet-blue {
label = "smartgw:internet:blue";
gpios = <&gpio 26 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- internet_green {
+ led-internet-green {
label = "smartgw:internet:green";
gpios = <&gpio 27 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- internet_red {
+ led-internet-red {
label = "smartgw:internet:red";
gpios = <&gpio 28 GPIO_ACTIVE_HIGH>;
default-state = "off";
};
- ethernet_link {
+ led-ethernet-link {
label = "smartgw:eth:link";
gpios = <&gpio 3 GPIO_ACTIVE_LOW>;
linux,default-trigger = "netdev";
};
- ethernet_activity {
+ led-ethernet-activity {
label = "smartgw:eth:act";
gpios = <&gpio 43 GPIO_ACTIVE_LOW>;
linux,default-trigger = "netdev";
diff --git a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts
index 0128bd8fa7ed..129b6710b699 100644
--- a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts
+++ b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc1.dts
@@ -33,13 +33,13 @@
gpio-leds {
compatible = "gpio-leds";
- power {
+ led-power {
label = "green:power";
gpios = <&gpio 6 GPIO_ACTIVE_LOW>;
linux,default-trigger = "default-on";
};
- system {
+ led-system {
label = "green:system";
gpios = <&gpio 8 GPIO_ACTIVE_LOW>;
linux,default-trigger = "disk-activity";
@@ -91,22 +91,16 @@
status = "okay";
};
-&gmac1 {
- status = "okay";
- phy-handle = <&ethphy4>;
-};
-
-&mdio {
- ethphy4: ethernet-phy@4 {
- reg = <4>;
- };
-};
-
&switch0 {
ports {
port@0 {
status = "okay";
label = "ethblack";
};
+
+ port@4 {
+ status = "okay";
+ label = "ethblue";
+ };
};
};
diff --git a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts
index e31417569e09..f810cd10f4f4 100644
--- a/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts
+++ b/arch/mips/boot/dts/ralink/mt7621-gnubee-gb-pc2.dts
@@ -33,33 +33,33 @@
gpio-leds {
compatible = "gpio-leds";
- ethblack-green {
+ led-ethblack-green {
label = "green:ethblack";
gpios = <&gpio 3 GPIO_ACTIVE_LOW>;
};
- ethblue-green {
+ led-ethblue-green {
label = "green:ethblue";
gpios = <&gpio 4 GPIO_ACTIVE_LOW>;
};
- ethyellow-green {
+ led-ethyellow-green {
label = "green:ethyellow";
gpios = <&gpio 15 GPIO_ACTIVE_LOW>;
};
- ethyellow-orange {
+ led-ethyellow-orange {
label = "orange:ethyellow";
gpios = <&gpio 13 GPIO_ACTIVE_LOW>;
};
- power {
+ led-power {
label = "green:power";
gpios = <&gpio 6 GPIO_ACTIVE_LOW>;
linux,default-trigger = "default-on";
};
- system {
+ led-system {
label = "green:system";
gpios = <&gpio 8 GPIO_ACTIVE_LOW>;
linux,default-trigger = "disk-activity";
@@ -112,9 +112,12 @@
};
&gmac1 {
- status = "okay";
phy-mode = "rgmii-rxid";
phy-handle = <&ethphy5>;
+
+ fixed-link {
+ status = "disabled";
+ };
};
&mdio {
@@ -134,5 +137,9 @@
status = "okay";
label = "ethblue";
};
+
+ port@5 {
+ status = "disabled";
+ };
};
};
diff --git a/arch/mips/boot/dts/ralink/mt7621.dtsi b/arch/mips/boot/dts/ralink/mt7621.dtsi
index aec85c779359..290d47fbcfbb 100644
--- a/arch/mips/boot/dts/ralink/mt7621.dtsi
+++ b/arch/mips/boot/dts/ralink/mt7621.dtsi
@@ -332,8 +332,13 @@
gmac1: mac@1 {
compatible = "mediatek,eth-mac";
reg = <1>;
- status = "disabled";
phy-mode = "rgmii";
+
+ fixed-link {
+ speed = <1000>;
+ full-duplex;
+ pause;
+ };
};
mdio: mdio-bus {
@@ -384,6 +389,18 @@
label = "swp4";
};
+ port@5 {
+ reg = <5>;
+ ethernet = <&gmac1>;
+ phy-mode = "rgmii";
+
+ fixed-link {
+ speed = <1000>;
+ full-duplex;
+ pause;
+ };
+ };
+
port@6 {
reg = <6>;
ethernet = <&gmac0>;
diff --git a/arch/mips/cavium-octeon/octeon-usb.c b/arch/mips/cavium-octeon/octeon-usb.c
index 5cffe1ed2447..28677c615175 100644
--- a/arch/mips/cavium-octeon/octeon-usb.c
+++ b/arch/mips/cavium-octeon/octeon-usb.c
@@ -245,7 +245,7 @@ static int dwc3_octeon_config_power(struct device *dev, u64 base)
power_active_low = 0;
gpio = gpio_pwr[1];
} else {
- dev_err(dev, "dwc3 controller clock init failure.\n");
+ dev_err(dev, "invalid power configuration\n");
return -EINVAL;
}
if ((OCTEON_IS_MODEL(OCTEON_CN73XX) ||
@@ -278,7 +278,7 @@ static int dwc3_octeon_config_power(struct device *dev, u64 base)
uctl_host_cfg.s.ppc_en = 0;
uctl_host_cfg.s.ppc_active_high_en = 0;
cvmx_write_csr(base + UCTL_HOST_CFG, uctl_host_cfg.u64);
- dev_warn(dev, "dwc3 controller clock init failure.\n");
+ dev_info(dev, "power control disabled\n");
}
return 0;
}
@@ -301,19 +301,19 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base)
i = of_property_read_u32(dev->of_node,
"refclk-frequency", &clock_rate);
if (i) {
- pr_err("No UCTL \"refclk-frequency\"\n");
+ dev_err(dev, "No UCTL \"refclk-frequency\"\n");
return -EINVAL;
}
i = of_property_read_string(dev->of_node,
"refclk-type-ss", &ss_clock_type);
if (i) {
- pr_err("No UCTL \"refclk-type-ss\"\n");
+ dev_err(dev, "No UCTL \"refclk-type-ss\"\n");
return -EINVAL;
}
i = of_property_read_string(dev->of_node,
"refclk-type-hs", &hs_clock_type);
if (i) {
- pr_err("No UCTL \"refclk-type-hs\"\n");
+ dev_err(dev, "No UCTL \"refclk-type-hs\"\n");
return -EINVAL;
}
if (strcmp("dlmc_ref_clk0", ss_clock_type) == 0) {
@@ -322,29 +322,29 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base)
else if (strcmp(hs_clock_type, "pll_ref_clk") == 0)
ref_clk_sel = 2;
else
- pr_err("Invalid HS clock type %s, using pll_ref_clk instead\n",
- hs_clock_type);
+ dev_warn(dev, "Invalid HS clock type %s, using pll_ref_clk instead\n",
+ hs_clock_type);
} else if (strcmp(ss_clock_type, "dlmc_ref_clk1") == 0) {
if (strcmp(hs_clock_type, "dlmc_ref_clk1") == 0)
ref_clk_sel = 1;
else if (strcmp(hs_clock_type, "pll_ref_clk") == 0)
ref_clk_sel = 3;
else {
- pr_err("Invalid HS clock type %s, using pll_ref_clk instead\n",
- hs_clock_type);
+ dev_warn(dev, "Invalid HS clock type %s, using pll_ref_clk instead\n",
+ hs_clock_type);
ref_clk_sel = 3;
}
} else
- pr_err("Invalid SS clock type %s, using dlmc_ref_clk0 instead\n",
- ss_clock_type);
+ dev_warn(dev, "Invalid SS clock type %s, using dlmc_ref_clk0 instead\n",
+ ss_clock_type);
if ((ref_clk_sel == 0 || ref_clk_sel == 1) &&
- (clock_rate != 100000000))
- pr_err("Invalid UCTL clock rate of %u, using 100000000 instead\n",
- clock_rate);
+ (clock_rate != 100000000))
+ dev_warn(dev, "Invalid UCTL clock rate of %u, using 100000000 instead\n",
+ clock_rate);
} else {
- pr_err("No USB UCTL device node\n");
+ dev_err(dev, "No USB UCTL device node\n");
return -EINVAL;
}
@@ -396,8 +396,8 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base)
uctl_ctl.s.ref_clk_div2 = 0;
switch (clock_rate) {
default:
- dev_err(dev, "Invalid ref_clk %u, using 100000000 instead\n",
- clock_rate);
+ dev_warn(dev, "Invalid ref_clk %u, using 100000000 instead\n",
+ clock_rate);
fallthrough;
case 100000000:
mpll_mul = 0x19;
@@ -438,10 +438,8 @@ static int dwc3_octeon_clocks_start(struct device *dev, u64 base)
udelay(10);
/* Steo 8c: Setup power-power control. */
- if (dwc3_octeon_config_power(dev, base)) {
- dev_err(dev, "Error configuring power.\n");
+ if (dwc3_octeon_config_power(dev, base))
return -EINVAL;
- }
/* Step 8d: Deassert UAHC reset signal. */
uctl_ctl.u64 = cvmx_read_csr(uctl_ctl_reg);
@@ -529,10 +527,10 @@ static int __init dwc3_octeon_device_init(void)
}
mutex_lock(&dwc3_octeon_clocks_mutex);
- dwc3_octeon_clocks_start(&pdev->dev, (u64)base);
+ if (dwc3_octeon_clocks_start(&pdev->dev, (u64)base) == 0)
+ dev_info(&pdev->dev, "clocks initialized.\n");
dwc3_octeon_set_endian_mode((u64)base);
dwc3_octeon_phy_reset((u64)base);
- dev_info(&pdev->dev, "clocks initialized.\n");
mutex_unlock(&dwc3_octeon_clocks_mutex);
devm_iounmap(&pdev->dev, base);
devm_release_mem_region(&pdev->dev, res->start,
diff --git a/arch/mips/include/asm/asmmacro-32.h b/arch/mips/include/asm/asmmacro-32.h
index 1c08c1f7903c..83a4940b7c89 100644
--- a/arch/mips/include/asm/asmmacro-32.h
+++ b/arch/mips/include/asm/asmmacro-32.h
@@ -15,7 +15,7 @@
.macro fpu_save_single thread tmp=t0
.set push
- SET_HARDFLOAT
+ .set hardfloat
cfc1 \tmp, fcr31
s.d $f0, THREAD_FPR0(\thread)
s.d $f2, THREAD_FPR2(\thread)
@@ -39,7 +39,7 @@
.macro fpu_restore_single thread tmp=t0
.set push
- SET_HARDFLOAT
+ .set hardfloat
lw \tmp, THREAD_FCR31(\thread)
l.d $f0, THREAD_FPR0(\thread)
l.d $f2, THREAD_FPR2(\thread)
diff --git a/arch/mips/include/asm/asmmacro.h b/arch/mips/include/asm/asmmacro.h
index ca83ada7015f..1c4438f3f2ab 100644
--- a/arch/mips/include/asm/asmmacro.h
+++ b/arch/mips/include/asm/asmmacro.h
@@ -83,7 +83,7 @@
.macro fpu_save_16even thread tmp=t0
.set push
- SET_HARDFLOAT
+ .set hardfloat
cfc1 \tmp, fcr31
sdc1 $f0, THREAD_FPR0(\thread)
sdc1 $f2, THREAD_FPR2(\thread)
@@ -109,7 +109,7 @@
.set push
.set mips64r2
.set fp=64
- SET_HARDFLOAT
+ .set hardfloat
sdc1 $f1, THREAD_FPR1(\thread)
sdc1 $f3, THREAD_FPR3(\thread)
sdc1 $f5, THREAD_FPR5(\thread)
@@ -142,7 +142,7 @@
.macro fpu_restore_16even thread tmp=t0
.set push
- SET_HARDFLOAT
+ .set hardfloat
lw \tmp, THREAD_FCR31(\thread)
ldc1 $f0, THREAD_FPR0(\thread)
ldc1 $f2, THREAD_FPR2(\thread)
@@ -168,7 +168,7 @@
.set push
.set mips64r2
.set fp=64
- SET_HARDFLOAT
+ .set hardfloat
ldc1 $f1, THREAD_FPR1(\thread)
ldc1 $f3, THREAD_FPR3(\thread)
ldc1 $f5, THREAD_FPR5(\thread)
@@ -373,7 +373,7 @@
.macro _cfcmsa rd, cs
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
insn_if_mips 0x787e0059 | (\cs << 11)
insn32_if_mm 0x587e0056 | (\cs << 11)
move \rd, $1
@@ -383,7 +383,7 @@
.macro _ctcmsa cd, rs
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
move $1, \rs
insn_if_mips 0x783e0819 | (\cd << 6)
insn32_if_mm 0x583e0816 | (\cd << 6)
@@ -393,7 +393,7 @@
.macro ld_b wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000820 | (\wd << 6)
insn32_if_mm 0x58000807 | (\wd << 6)
@@ -403,7 +403,7 @@
.macro ld_h wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000821 | (\wd << 6)
insn32_if_mm 0x58000817 | (\wd << 6)
@@ -413,7 +413,7 @@
.macro ld_w wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000822 | (\wd << 6)
insn32_if_mm 0x58000827 | (\wd << 6)
@@ -423,7 +423,7 @@
.macro ld_d wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000823 | (\wd << 6)
insn32_if_mm 0x58000837 | (\wd << 6)
@@ -433,7 +433,7 @@
.macro st_b wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000824 | (\wd << 6)
insn32_if_mm 0x5800080f | (\wd << 6)
@@ -443,7 +443,7 @@
.macro st_h wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000825 | (\wd << 6)
insn32_if_mm 0x5800081f | (\wd << 6)
@@ -453,7 +453,7 @@
.macro st_w wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000826 | (\wd << 6)
insn32_if_mm 0x5800082f | (\wd << 6)
@@ -463,7 +463,7 @@
.macro st_d wd, off, base
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
PTR_ADDU $1, \base, \off
insn_if_mips 0x78000827 | (\wd << 6)
insn32_if_mm 0x5800083f | (\wd << 6)
@@ -473,7 +473,7 @@
.macro copy_s_w ws, n
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
insn_if_mips 0x78b00059 | (\n << 16) | (\ws << 11)
insn32_if_mm 0x58b00056 | (\n << 16) | (\ws << 11)
.set pop
@@ -482,7 +482,7 @@
.macro copy_s_d ws, n
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
insn_if_mips 0x78b80059 | (\n << 16) | (\ws << 11)
insn32_if_mm 0x58b80056 | (\n << 16) | (\ws << 11)
.set pop
@@ -491,7 +491,7 @@
.macro insert_w wd, n
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
insn_if_mips 0x79300819 | (\n << 16) | (\wd << 6)
insn32_if_mm 0x59300816 | (\n << 16) | (\wd << 6)
.set pop
@@ -500,7 +500,7 @@
.macro insert_d wd, n
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
insn_if_mips 0x79380819 | (\n << 16) | (\wd << 6)
insn32_if_mm 0x59380816 | (\n << 16) | (\wd << 6)
.set pop
@@ -553,7 +553,7 @@
st_d 29, THREAD_FPR29 - FPR_BASE_OFFS, FPR_BASE
st_d 30, THREAD_FPR30 - FPR_BASE_OFFS, FPR_BASE
st_d 31, THREAD_FPR31 - FPR_BASE_OFFS, FPR_BASE
- SET_HARDFLOAT
+ .set hardfloat
_cfcmsa $1, MSA_CSR
sw $1, THREAD_MSA_CSR(\thread)
.set pop
@@ -562,7 +562,7 @@
.macro msa_restore_all thread
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
lw $1, THREAD_MSA_CSR(\thread)
_ctcmsa MSA_CSR, $1
#ifdef TOOLCHAIN_SUPPORTS_MSA
@@ -618,7 +618,7 @@
.macro msa_init_all_upper
.set push
.set noat
- SET_HARDFLOAT
+ .set hardfloat
not $1, zero
msa_init_upper 0
msa_init_upper 1
diff --git a/arch/mips/include/asm/fpregdef.h b/arch/mips/include/asm/fpregdef.h
index f184ba088532..429481f9028d 100644
--- a/arch/mips/include/asm/fpregdef.h
+++ b/arch/mips/include/asm/fpregdef.h
@@ -14,20 +14,6 @@
#include <asm/sgidefs.h>
-/*
- * starting with binutils 2.24.51.20140729, MIPS binutils warn about mixing
- * hardfloat and softfloat object files. The kernel build uses soft-float by
- * default, so we also need to pass -msoft-float along to GAS if it supports it.
- * But this in turn causes assembler errors in files which access hardfloat
- * registers. We detect if GAS supports "-msoft-float" in the Makefile and
- * explicitly put ".set hardfloat" where floating point registers are touched.
- */
-#ifdef GAS_HAS_SET_HARDFLOAT
-#define SET_HARDFLOAT .set hardfloat
-#else
-#define SET_HARDFLOAT
-#endif
-
#if _MIPS_SIM == _MIPS_SIM_ABI32
/*
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 5cedb28e8a40..2803c9c21ef9 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -758,7 +758,7 @@ struct kvm_mips_callbacks {
void (*vcpu_reenter)(struct kvm_vcpu *vcpu);
};
extern struct kvm_mips_callbacks *kvm_mips_callbacks;
-int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
+int kvm_mips_emulation_init(void);
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
@@ -888,7 +888,6 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
extern int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
-static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot) {}
diff --git a/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h b/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h
index 30f4114ab872..3c401f11655e 100644
--- a/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h
+++ b/arch/mips/include/asm/mach-bcm47xx/bcm47xx_board.h
@@ -61,6 +61,7 @@ enum bcm47xx_board {
BCM47XX_BOARD_LINKSYS_E1000V21,
BCM47XX_BOARD_LINKSYS_E1200V2,
BCM47XX_BOARD_LINKSYS_E2000V1,
+ BCM47XX_BOARD_LINKSYS_E2500V3,
BCM47XX_BOARD_LINKSYS_E3000V1,
BCM47XX_BOARD_LINKSYS_E3200V1,
BCM47XX_BOARD_LINKSYS_E4200V1,
diff --git a/arch/mips/include/asm/mach-rc32434/pci.h b/arch/mips/include/asm/mach-rc32434/pci.h
index 9a6eefd12757..3eb767c8a4ee 100644
--- a/arch/mips/include/asm/mach-rc32434/pci.h
+++ b/arch/mips/include/asm/mach-rc32434/pci.h
@@ -374,7 +374,7 @@ struct pci_msu {
PCI_CFG04_STAT_SSE | \
PCI_CFG04_STAT_PE)
-#define KORINA_CNFG1 ((KORINA_STAT<<16)|KORINA_CMD)
+#define KORINA_CNFG1 (KORINA_STAT | KORINA_CMD)
#define KORINA_REVID 0
#define KORINA_CLASS_CODE 0
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index 99eeafe6dcab..2d53704d9f24 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -2367,7 +2367,7 @@ do { \
/*
* Macros to access the floating point coprocessor control registers
*/
-#define _read_32bit_cp1_register(source, gas_hardfloat) \
+#define read_32bit_cp1_register(source) \
({ \
unsigned int __res; \
\
@@ -2377,36 +2377,24 @@ do { \
" # gas fails to assemble cfc1 for some archs, \n" \
" # like Octeon. \n" \
" .set mips1 \n" \
- " "STR(gas_hardfloat)" \n" \
+ " .set hardfloat \n" \
" cfc1 %0,"STR(source)" \n" \
" .set pop \n" \
: "=r" (__res)); \
__res; \
})
-#define _write_32bit_cp1_register(dest, val, gas_hardfloat) \
+#define write_32bit_cp1_register(dest, val) \
do { \
__asm__ __volatile__( \
" .set push \n" \
" .set reorder \n" \
- " "STR(gas_hardfloat)" \n" \
+ " .set hardfloat \n" \
" ctc1 %0,"STR(dest)" \n" \
" .set pop \n" \
: : "r" (val)); \
} while (0)
-#ifdef GAS_HAS_SET_HARDFLOAT
-#define read_32bit_cp1_register(source) \
- _read_32bit_cp1_register(source, .set hardfloat)
-#define write_32bit_cp1_register(dest, val) \
- _write_32bit_cp1_register(dest, val, .set hardfloat)
-#else
-#define read_32bit_cp1_register(source) \
- _read_32bit_cp1_register(source, )
-#define write_32bit_cp1_register(dest, val) \
- _write_32bit_cp1_register(dest, val, )
-#endif
-
#ifdef TOOLCHAIN_SUPPORTS_DSP
#define rddsp(mask) \
({ \
diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h
index 25fa651c937d..ebdf4d910af2 100644
--- a/arch/mips/include/asm/syscall.h
+++ b/arch/mips/include/asm/syscall.h
@@ -38,7 +38,7 @@ static inline bool mips_syscall_is_indirect(struct task_struct *task,
static inline long syscall_get_nr(struct task_struct *task,
struct pt_regs *regs)
{
- return current_thread_info()->syscall;
+ return task_thread_info(task)->syscall;
}
static inline void mips_syscall_update_nr(struct task_struct *task,
diff --git a/arch/mips/include/asm/vpe.h b/arch/mips/include/asm/vpe.h
index baa949a744cb..ef7e07829607 100644
--- a/arch/mips/include/asm/vpe.h
+++ b/arch/mips/include/asm/vpe.h
@@ -102,7 +102,6 @@ struct vpe_control {
struct list_head tc_list; /* Thread contexts */
};
-extern unsigned long physical_memsize;
extern struct vpe_control vpecontrol;
extern const struct file_operations vpe_fops;
diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S
index 3425df6019c0..b6de8e88c1bd 100644
--- a/arch/mips/kernel/genex.S
+++ b/arch/mips/kernel/genex.S
@@ -480,7 +480,7 @@ NESTED(nmi_handler, PT_SIZE, sp)
.set push
/* gas fails to assemble cfc1 for some archs (octeon).*/ \
.set mips1
- SET_HARDFLOAT
+ .set hardfloat
cfc1 a1, fcr31
.set pop
.endm
diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S
index 2748c55820c2..6c745aa9e825 100644
--- a/arch/mips/kernel/r2300_fpu.S
+++ b/arch/mips/kernel/r2300_fpu.S
@@ -64,7 +64,7 @@ LEAF(_restore_fp)
*/
LEAF(_save_fp_context)
.set push
- SET_HARDFLOAT
+ .set hardfloat
li v0, 0 # assume success
cfc1 t1, fcr31
EX2(s.d $f0, 0(a0))
@@ -98,7 +98,7 @@ LEAF(_save_fp_context)
*/
LEAF(_restore_fp_context)
.set push
- SET_HARDFLOAT
+ .set hardfloat
li v0, 0 # assume success
EX(lw t0, (a1))
EX2(l.d $f0, 0(a0))
diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
index 2e687c60bc4f..4e8c98517d9d 100644
--- a/arch/mips/kernel/r4k_fpu.S
+++ b/arch/mips/kernel/r4k_fpu.S
@@ -26,7 +26,7 @@
.macro EX insn, reg, src
.set push
- SET_HARDFLOAT
+ .set hardfloat
.set nomacro
.ex\@: \insn \reg, \src
.set pop
@@ -98,14 +98,14 @@ LEAF(_init_msa_upper)
*/
LEAF(_save_fp_context)
.set push
- SET_HARDFLOAT
+ .set hardfloat
cfc1 t1, fcr31
.set pop
#if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPSR2) || \
defined(CONFIG_CPU_MIPSR5) || defined(CONFIG_CPU_MIPSR6)
.set push
- SET_HARDFLOAT
+ .set hardfloat
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5)
.set mips32r2
.set fp=64
@@ -135,7 +135,7 @@ LEAF(_save_fp_context)
#endif
.set push
- SET_HARDFLOAT
+ .set hardfloat
/* Store the 16 even double precision registers */
EX sdc1 $f0, 0(a0)
EX sdc1 $f2, 16(a0)
@@ -173,7 +173,7 @@ LEAF(_restore_fp_context)
#if defined(CONFIG_64BIT) || defined(CONFIG_CPU_MIPSR2) || \
defined(CONFIG_CPU_MIPSR5) || defined(CONFIG_CPU_MIPSR6)
.set push
- SET_HARDFLOAT
+ .set hardfloat
#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR5)
.set mips32r2
.set fp=64
@@ -201,7 +201,7 @@ LEAF(_restore_fp_context)
1: .set pop
#endif
.set push
- SET_HARDFLOAT
+ .set hardfloat
EX ldc1 $f0, 0(a0)
EX ldc1 $f2, 16(a0)
EX ldc1 $f4, 32(a0)
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index bcd6a944b839..f2df0cae1b4d 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -424,9 +424,11 @@ static void cps_shutdown_this_cpu(enum cpu_death death)
wmb();
}
} else {
- pr_debug("Gating power to core %d\n", core);
- /* Power down the core */
- cps_pm_enter_state(CPS_PM_POWER_GATED);
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+ pr_debug("Gating power to core %d\n", core);
+ /* Power down the core */
+ cps_pm_enter_state(CPS_PM_POWER_GATED);
+ }
}
}
diff --git a/arch/mips/kernel/vpe-mt.c b/arch/mips/kernel/vpe-mt.c
index 84a82b551ec3..223d6274f2e5 100644
--- a/arch/mips/kernel/vpe-mt.c
+++ b/arch/mips/kernel/vpe-mt.c
@@ -92,12 +92,11 @@ int vpe_run(struct vpe *v)
write_tc_c0_tchalt(read_tc_c0_tchalt() & ~TCHALT_H);
/*
- * The sde-kit passes 'memsize' to __start in $a3, so set something
- * here... Or set $a3 to zero and define DFLT_STACK_SIZE and
- * DFLT_HEAP_SIZE when you compile your program
+ * We don't pass the memsize here, so VPE programs need to be
+ * compiled with DFLT_STACK_SIZE and DFLT_HEAP_SIZE defined.
*/
+ mttgpr(7, 0);
mttgpr(6, v->ntcs);
- mttgpr(7, physical_memsize);
/* set up VPE1 */
/*
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 91d197bee9c0..29e51649203b 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
select MMU_NOTIFIER
select SRCU
select INTERVAL_TREE
+ select KVM_GENERIC_HARDWARE_ENABLING
help
Support for hosting Guest kernels.
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 21ff75bcdbc4..805aeea2166e 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -17,4 +17,4 @@ kvm-$(CONFIG_CPU_LOONGSON64) += loongson_ipi.o
kvm-y += vz.o
obj-$(CONFIG_KVM) += kvm.o
-obj-y += callback.o tlb.o
+obj-y += tlb.o
diff --git a/arch/mips/kvm/callback.c b/arch/mips/kvm/callback.c
deleted file mode 100644
index d88aa2173fb0..000000000000
--- a/arch/mips/kvm/callback.c
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
- * Authors: Yann Le Du <ledu@kymasys.com>
- */
-
-#include <linux/export.h>
-#include <linux/kvm_host.h>
-
-struct kvm_mips_callbacks *kvm_mips_callbacks;
-EXPORT_SYMBOL_GPL(kvm_mips_callbacks);
diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S
index 16f17c6390dd..eb2e8cc3532f 100644
--- a/arch/mips/kvm/fpu.S
+++ b/arch/mips/kvm/fpu.S
@@ -22,7 +22,7 @@
LEAF(__kvm_save_fpu)
.set push
- SET_HARDFLOAT
+ .set hardfloat
.set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
@@ -66,7 +66,7 @@ LEAF(__kvm_save_fpu)
LEAF(__kvm_restore_fpu)
.set push
- SET_HARDFLOAT
+ .set hardfloat
.set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
@@ -110,7 +110,7 @@ LEAF(__kvm_restore_fpu)
LEAF(__kvm_restore_fcsr)
.set push
- SET_HARDFLOAT
+ .set hardfloat
lw t0, VCPU_FCR31(a0)
/*
* The ctc1 must stay at this offset in __kvm_restore_fcsr.
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index a25e0b73ee70..36c8991b5d39 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -135,16 +135,6 @@ void kvm_arch_hardware_disable(void)
kvm_mips_callbacks->hardware_disable();
}
-int kvm_arch_hardware_setup(void *opaque)
-{
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return 0;
-}
-
extern void kvm_init_loongson_ipi(struct kvm *kvm);
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -1015,21 +1005,6 @@ long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
return r;
}
-int kvm_arch_init(void *opaque)
-{
- if (kvm_mips_callbacks) {
- kvm_err("kvm: module already exists\n");
- return -EEXIST;
- }
-
- return kvm_mips_emulation_init(&kvm_mips_callbacks);
-}
-
-void kvm_arch_exit(void)
-{
- kvm_mips_callbacks = NULL;
-}
-
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -1646,16 +1621,21 @@ static int __init kvm_mips_init(void)
if (ret)
return ret;
- ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
-
+ ret = kvm_mips_emulation_init();
if (ret)
return ret;
+
if (boot_cpu_type() == CPU_LOONGSON64)
kvm_priority_to_irq = kvm_loongson3_priority_to_irq;
register_die_notifier(&kvm_mips_csr_die_notifier);
+ ret = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (ret) {
+ unregister_die_notifier(&kvm_mips_csr_die_notifier);
+ return ret;
+ }
return 0;
}
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index c706f5890a05..dafab003ea0d 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -3304,7 +3304,10 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.vcpu_reenter = kvm_vz_vcpu_reenter,
};
-int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
+/* FIXME: Get rid of the callbacks now that trap-and-emulate is gone. */
+struct kvm_mips_callbacks *kvm_mips_callbacks = &kvm_vz_callbacks;
+
+int kvm_mips_emulation_init(void)
{
if (!cpu_has_vz)
return -ENODEV;
@@ -3318,7 +3321,5 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
return -ENODEV;
pr_info("Starting KVM with MIPS VZ extensions\n");
-
- *install_callbacks = &kvm_vz_callbacks;
return 0;
}
diff --git a/arch/mips/lantiq/prom.c b/arch/mips/lantiq/prom.c
index be4829cc7a3a..a3cf29365858 100644
--- a/arch/mips/lantiq/prom.c
+++ b/arch/mips/lantiq/prom.c
@@ -23,12 +23,6 @@ DEFINE_SPINLOCK(ebu_lock);
EXPORT_SYMBOL_GPL(ebu_lock);
/*
- * This is needed by the VPE loader code, just set it to 0 and assume
- * that the firmware hardcodes this value to something useful.
- */
-unsigned long physical_memsize = 0L;
-
-/*
* this struct is filled by the soc specific detection code and holds
* information about the specific soc type, revision and name
*/
diff --git a/arch/mips/lantiq/xway/dcdc.c b/arch/mips/lantiq/xway/dcdc.c
index 4960bee0a99d..96199966a350 100644
--- a/arch/mips/lantiq/xway/dcdc.c
+++ b/arch/mips/lantiq/xway/dcdc.c
@@ -22,10 +22,7 @@ static void __iomem *dcdc_membase;
static int dcdc_probe(struct platform_device *pdev)
{
- struct resource *res;
-
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- dcdc_membase = devm_ioremap_resource(&pdev->dev, res);
+ dcdc_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(dcdc_membase))
return PTR_ERR(dcdc_membase);
diff --git a/arch/mips/lantiq/xway/dma.c b/arch/mips/lantiq/xway/dma.c
index f8eedeb15f18..934ac72937e5 100644
--- a/arch/mips/lantiq/xway/dma.c
+++ b/arch/mips/lantiq/xway/dma.c
@@ -239,12 +239,10 @@ static int
ltq_dma_init(struct platform_device *pdev)
{
struct clk *clk;
- struct resource *res;
unsigned int id, nchannels;
int i;
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- ltq_dma_membase = devm_ioremap_resource(&pdev->dev, res);
+ ltq_dma_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(ltq_dma_membase))
panic("Failed to remap dma resource");
diff --git a/arch/mips/lantiq/xway/gptu.c b/arch/mips/lantiq/xway/gptu.c
index 200fe9ff641d..a492b1eb1925 100644
--- a/arch/mips/lantiq/xway/gptu.c
+++ b/arch/mips/lantiq/xway/gptu.c
@@ -136,17 +136,14 @@ static inline void clkdev_add_gptu(struct device *dev, const char *con,
static int gptu_probe(struct platform_device *pdev)
{
struct clk *clk;
- struct resource *res;
if (of_irq_to_resource_table(pdev->dev.of_node, irqres, 6) != 6) {
dev_err(&pdev->dev, "Failed to get IRQ list\n");
return -EINVAL;
}
- res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
/* remap gptu register range */
- gptu_membase = devm_ioremap_resource(&pdev->dev, res);
+ gptu_membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(gptu_membase))
return PTR_ERR(gptu_membase);
diff --git a/arch/mips/loongson2ef/Platform b/arch/mips/loongson2ef/Platform
index eebabf9df6ac..c6f7a4b95997 100644
--- a/arch/mips/loongson2ef/Platform
+++ b/arch/mips/loongson2ef/Platform
@@ -25,7 +25,7 @@ cflags-$(CONFIG_CPU_LOONGSON2F) += -march=loongson2f
# binutils does not merge support for the flag then we can revisit & remove
# this later - for now it ensures vendor toolchains don't cause problems.
#
-cflags-$(CONFIG_CPU_LOONGSON2EF) += $(call as-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
+cflags-$(CONFIG_CPU_LOONGSON2EF) += $(call cc-option,-Wa$(comma)-mno-fix-loongson3-llsc,)
# Enable the workarounds for Loongson2f
ifdef CONFIG_CPU_LOONGSON2F_WORKAROUNDS
diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 8d16cd021f60..d967e4c0cf24 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -204,17 +204,13 @@ static int ltq_pci_startup(struct platform_device *pdev)
static int ltq_pci_probe(struct platform_device *pdev)
{
- struct resource *res_cfg, *res_bridge;
-
pci_clear_flags(PCI_PROBE_ONLY);
- res_bridge = platform_get_resource(pdev, IORESOURCE_MEM, 1);
- ltq_pci_membase = devm_ioremap_resource(&pdev->dev, res_bridge);
+ ltq_pci_membase = devm_platform_get_and_ioremap_resource(pdev, 1, NULL);
if (IS_ERR(ltq_pci_membase))
return PTR_ERR(ltq_pci_membase);
- res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- ltq_pci_mapped_cfg = devm_ioremap_resource(&pdev->dev, res_cfg);
+ ltq_pci_mapped_cfg = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(ltq_pci_mapped_cfg))
return PTR_ERR(ltq_pci_mapped_cfg);
diff --git a/arch/mips/pci/pci-mt7620.c b/arch/mips/pci/pci-mt7620.c
index e032932348d6..2700d75d41c5 100644
--- a/arch/mips/pci/pci-mt7620.c
+++ b/arch/mips/pci/pci-mt7620.c
@@ -282,21 +282,17 @@ static int mt7628_pci_hw_init(struct platform_device *pdev)
static int mt7620_pci_probe(struct platform_device *pdev)
{
- struct resource *bridge_res = platform_get_resource(pdev,
- IORESOURCE_MEM, 0);
- struct resource *pcie_res = platform_get_resource(pdev,
- IORESOURCE_MEM, 1);
u32 val = 0;
rstpcie0 = devm_reset_control_get_exclusive(&pdev->dev, "pcie0");
if (IS_ERR(rstpcie0))
return PTR_ERR(rstpcie0);
- bridge_base = devm_ioremap_resource(&pdev->dev, bridge_res);
+ bridge_base = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(bridge_base))
return PTR_ERR(bridge_base);
- pcie_base = devm_ioremap_resource(&pdev->dev, pcie_res);
+ pcie_base = devm_platform_get_and_ioremap_resource(pdev, 1, NULL);
if (IS_ERR(pcie_base))
return PTR_ERR(pcie_base);
diff --git a/arch/mips/ralink/Kconfig b/arch/mips/ralink/Kconfig
index f9fe15630abb..06031796c87b 100644
--- a/arch/mips/ralink/Kconfig
+++ b/arch/mips/ralink/Kconfig
@@ -54,10 +54,11 @@ choice
select HAVE_PCI
select PCI_DRIVERS_GENERIC
select SOC_BUS
+ select PINCTRL_MT7621
help
- The MT7621 system-on-a-chip includes an 880 MHz MIPS1004Kc dual-core CPU,
- a 5-port 10/100/1000 switch/PHY and one RGMII.
+ The MT7621 system-on-a-chip includes an 880 MHz MIPS1004Kc
+ dual-core CPU, a 5-port 10/100/1000 switch/PHY and one RGMII.
endchoice
choice
diff --git a/arch/mips/ralink/timer.c b/arch/mips/ralink/timer.c
index 652424d8ed51..fc503679a93d 100644
--- a/arch/mips/ralink/timer.c
+++ b/arch/mips/ralink/timer.c
@@ -95,7 +95,6 @@ static int rt_timer_enable(struct rt_timer *rt)
static int rt_timer_probe(struct platform_device *pdev)
{
- struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
struct rt_timer *rt;
struct clk *clk;
@@ -109,7 +108,7 @@ static int rt_timer_probe(struct platform_device *pdev)
if (rt->irq < 0)
return rt->irq;
- rt->membase = devm_ioremap_resource(&pdev->dev, res);
+ rt->membase = devm_platform_get_and_ioremap_resource(pdev, 0, NULL);
if (IS_ERR(rt->membase))
return PTR_ERR(rt->membase);
diff --git a/arch/mips/vdso/Kconfig b/arch/mips/vdso/Kconfig
index a665f6108cb5..70140248da72 100644
--- a/arch/mips/vdso/Kconfig
+++ b/arch/mips/vdso/Kconfig
@@ -1,18 +1,6 @@
-# For the pre-R6 code in arch/mips/vdso/vdso.h for locating
-# the base address of VDSO, the linker will emit a R_MIPS_PC32
-# relocation in binutils > 2.25 but it will fail with older versions
-# because that relocation is not supported for that symbol. As a result
-# of which we are forced to disable the VDSO symbols when building
-# with < 2.25 binutils on pre-R6 kernels. For more references on why we
-# can't use other methods to get the base address of VDSO please refer to
-# the comments on that file.
-#
# GCC (at least up to version 9.2) appears to emit function calls that make use
# of the GOT when targeting microMIPS, which we can't use in the VDSO due to
# the lack of relocations. As such, we disable the VDSO for microMIPS builds.
-config MIPS_LD_CAN_LINK_VDSO
- def_bool LD_VERSION >= 22500 || LD_IS_LLD
-
config MIPS_DISABLE_VDSO
- def_bool CPU_MICROMIPS || (!CPU_MIPSR6 && !MIPS_LD_CAN_LINK_VDSO)
+ def_bool CPU_MICROMIPS
diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index 1f7d5c6c10b0..18af9474ed0e 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -52,9 +52,6 @@ endif
CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE)
ifdef CONFIG_MIPS_DISABLE_VDSO
- ifndef CONFIG_MIPS_LD_CAN_LINK_VDSO
- $(warning MIPS VDSO requires binutils >= 2.25)
- endif
obj-vdso-y := $(filter-out vgettimeofday.o, $(obj-vdso-y))
endif
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7a5f8dbfbdd0..2c9cdf1d8761 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -197,6 +197,7 @@ config PPC
select HAVE_ARCH_KASAN if PPC_RADIX_MMU
select HAVE_ARCH_KASAN if PPC_BOOK3E_64
select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
+ select HAVE_ARCH_KCSAN if PPC_BOOK3S_64
select HAVE_ARCH_KFENCE if ARCH_SUPPORTS_DEBUG_PAGEALLOC
select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_KGDB
@@ -206,7 +207,7 @@ config PPC
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_ASM_MODVERSIONS
- select HAVE_CONTEXT_TRACKING_USER if PPC64
+ select HAVE_CONTEXT_TRACKING_USER
select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
@@ -256,6 +257,7 @@ config PPC
select HAVE_STATIC_CALL if PPC32
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
+ select HAVE_VIRT_CPU_ACCOUNTING_GEN
select HUGETLB_PAGE_SIZE_VARIABLE if PPC_BOOK3S_64 && HUGETLB_PAGE
select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
@@ -387,10 +389,22 @@ config PPC_DCR
depends on PPC_DCR_NATIVE || PPC_DCR_MMIO
default y
+config PPC_PCI_OF_BUS_MAP
+ bool "Use pci_to_OF_bus_map (deprecated)"
+ depends on PPC32
+ depends on PPC_PMAC || PPC_CHRP
+ help
+ This option uses pci_to_OF_bus_map to map OF nodes to PCI devices, which
+ restricts the system to only having 256 PCI buses. On CHRP it also causes
+ the "pci-OF-bus-map" property to be created in the device tree.
+
+ If unsure, say "N".
+
config PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT
depends on PPC32
- depends on !PPC_PMAC && !PPC_CHRP
+ depends on !PPC_PCI_OF_BUS_MAP
bool "Assign PCI bus numbers from zero individually for each PCI domain"
+ default y
help
By default on PPC32 were PCI bus numbers unique across all PCI domains.
So system could have only 256 PCI buses independently of available
@@ -1028,6 +1042,7 @@ config PPC_SECURE_BOOT
depends on PPC_POWERNV || PPC_PSERIES
depends on IMA_ARCH_POLICY
imply IMA_SECURE_AND_OR_TRUSTED_BOOT
+ select PSERIES_PLPKS if PPC_PSERIES
help
Systems with firmware secure boot enabled need to define security
policies to extend secure boot to the OS. This config allows a user
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 533457466ce2..e91d7e91347d 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -90,7 +90,7 @@ aflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -mlittle-endian
ifeq ($(HAS_BIARCH),y)
KBUILD_CFLAGS += -m$(BITS)
-KBUILD_AFLAGS += -m$(BITS) -Wl,-a$(BITS)
+KBUILD_AFLAGS += -m$(BITS)
KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION)
endif
@@ -146,19 +146,6 @@ CFLAGS-$(CONFIG_PPC32) += $(call cc-option, $(MULTIPLEWORD))
CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata)
-ifdef CONFIG_PPC_BOOK3S_64
-ifdef CONFIG_CPU_LITTLE_ENDIAN
-CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=power8
-else
-CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=power4
-endif
-CFLAGS-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=power10, \
- $(call cc-option,-mtune=power9, \
- $(call cc-option,-mtune=power8)))
-else ifdef CONFIG_PPC_BOOK3E_64
-CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
-endif
-
ifdef CONFIG_FUNCTION_TRACER
CC_FLAGS_FTRACE := -pg
ifdef CONFIG_MPROFILE_KERNEL
@@ -166,11 +153,12 @@ CC_FLAGS_FTRACE += -mprofile-kernel
endif
endif
-CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU))
-AFLAGS-$(CONFIG_TARGET_CPU_BOOL) += $(call cc-option,-mcpu=$(CONFIG_TARGET_CPU))
+CFLAGS-$(CONFIG_TARGET_CPU_BOOL) += -mcpu=$(CONFIG_TARGET_CPU)
+AFLAGS-$(CONFIG_TARGET_CPU_BOOL) += -mcpu=$(CONFIG_TARGET_CPU)
-CFLAGS-$(CONFIG_E5500_CPU) += $(call cc-option,-mcpu=e500mc64,-mcpu=powerpc64)
-CFLAGS-$(CONFIG_E6500_CPU) += $(call cc-option,-mcpu=e6500,$(E5500_CPU))
+CFLAGS-$(CONFIG_POWERPC64_CPU) += $(call cc-option,-mtune=power10, \
+ $(call cc-option,-mtune=power9, \
+ $(call cc-option,-mtune=power8)))
asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
@@ -213,10 +201,7 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# often slow when they are implemented at all
KBUILD_CFLAGS += $(call cc-option,-mno-string)
-cpu-as-$(CONFIG_40x) += -Wa,-m405
-cpu-as-$(CONFIG_44x) += -Wa,-m440
cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec)
-cpu-as-$(CONFIG_PPC_E500) += -Wa,-me500
# When using '-many -mpower4' gas will first try and find a matching power4
# mnemonic and failing that it will allow any valid mnemonic that GAS knows
@@ -224,7 +209,6 @@ cpu-as-$(CONFIG_PPC_E500) += -Wa,-me500
# LLVM IAS doesn't understand either flag: https://github.com/ClangBuiltLinux/linux/issues/675
# but LLVM IAS only supports ISA >= 2.06 for Book3S 64 anyway...
cpu-as-$(CONFIG_PPC_BOOK3S_64) += $(call as-option,-Wa$(comma)-mpower4) $(call as-option,-Wa$(comma)-many)
-cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc)
KBUILD_AFLAGS += $(cpu-as-y)
KBUILD_CFLAGS += $(cpu-as-y)
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
index a6c77f4d32b2..1f860b3c9bec 100644
--- a/arch/powerpc/Makefile.postlink
+++ b/arch/powerpc/Makefile.postlink
@@ -9,7 +9,7 @@ PHONY := __archpost
__archpost:
-include include/config/auto.conf
-include scripts/Kbuild.include
+include $(srctree)/scripts/Kbuild.include
quiet_cmd_head_check = CHKHEAD $@
cmd_head_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/head_check.sh "$(NM)" "$@"
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index d32d95aea5d6..295f76df13b5 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -39,13 +39,19 @@ BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
$(LINUXINCLUDE)
ifdef CONFIG_PPC64_BOOT_WRAPPER
-ifdef CONFIG_CPU_LITTLE_ENDIAN
-BOOTCFLAGS += -m64 -mcpu=powerpc64le
+BOOTCFLAGS += -m64
else
-BOOTCFLAGS += -m64 -mcpu=powerpc64
+BOOTCFLAGS += -m32
endif
+
+ifdef CONFIG_TARGET_CPU_BOOL
+BOOTCFLAGS += -mcpu=$(CONFIG_TARGET_CPU)
+else ifdef CONFIG_PPC64_BOOT_WRAPPER
+ifdef CONFIG_CPU_LITTLE_ENDIAN
+BOOTCFLAGS += -mcpu=powerpc64le
else
-BOOTCFLAGS += -m32 -mcpu=powerpc
+BOOTCFLAGS += -mcpu=powerpc64
+endif
endif
BOOTCFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include)
diff --git a/arch/powerpc/boot/dts/turris1x.dts b/arch/powerpc/boot/dts/turris1x.dts
index e9cda34a140e..c9b619f6ed5c 100644
--- a/arch/powerpc/boot/dts/turris1x.dts
+++ b/arch/powerpc/boot/dts/turris1x.dts
@@ -367,11 +367,34 @@
};
reboot@d {
+ /*
+ * CPLD firmware which manages system reset and
+ * watchdog registers has bugs. It does not
+ * autoclear system reset register after change
+ * and watchdog ignores reset line on immediate
+ * succeeding reset cycle triggered by watchdog.
+ * These bugs have to be workarounded in U-Boot
+ * bootloader. So use system reset via syscon as
+ * a last resort because older U-Boot versions
+ * do not have workaround for watchdog.
+ *
+ * Reset method via rstcr's global-utilities
+ * (the preferred one) has priority level 128,
+ * watchdog has priority level 0 and default
+ * syscon-reboot priority level is 192.
+ *
+ * So define syscon-reboot with custom priority
+ * level 64 (between rstcr and watchdog) because
+ * rstcr should stay as default preferred reset
+ * method and reset via watchdog is more broken
+ * than system reset via syscon.
+ */
compatible = "syscon-reboot";
reg = <0x0d 0x01>;
offset = <0x0d>;
mask = <0x01>;
value = <0x01>;
+ priority = <64>;
};
led-controller@13 {
diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig
index 0a1b42c4f26a..52a8c5450ecb 100644
--- a/arch/powerpc/configs/ps3_defconfig
+++ b/arch/powerpc/configs/ps3_defconfig
@@ -1,8 +1,3 @@
-CONFIG_PPC64=y
-CONFIG_CELL_CPU=y
-CONFIG_ALTIVEC=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=2
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
CONFIG_HIGH_RES_TIMERS=y
@@ -10,11 +5,12 @@ CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_EMBEDDED=y
# CONFIG_PERF_EVENTS is not set
-# CONFIG_COMPAT_BRK is not set
-CONFIG_SLAB=y
CONFIG_PROFILING=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
+CONFIG_PPC64=y
+CONFIG_CELL_CPU=y
+CONFIG_ALTIVEC=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
# CONFIG_PPC_POWERNV is not set
# CONFIG_PPC_PSERIES is not set
# CONFIG_PPC_PMAC is not set
@@ -27,17 +23,20 @@ CONFIG_PS3_FLASH=y
CONFIG_PS3_VRAM=m
CONFIG_PS3_LPM=m
# CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-CONFIG_BINFMT_MISC=y
CONFIG_KEXEC=y
CONFIG_PPC_4K_PAGES=y
-# CONFIG_SPARSEMEM_VMEMMAP is not set
-# CONFIG_COMPACTION is not set
CONFIG_SCHED_SMT=y
CONFIG_PM=y
CONFIG_PM_DEBUG=y
# CONFIG_SECCOMP is not set
-# CONFIG_PCI is not set
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_SLAB=y
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_SPARSEMEM_VMEMMAP is not set
+# CONFIG_COMPACTION is not set
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_UNIX=y
@@ -87,7 +86,6 @@ CONFIG_USB_USBNET=m
# CONFIG_USB_NET_NET1080 is not set
# CONFIG_USB_NET_CDC_SUBSET is not set
# CONFIG_USB_NET_ZAURUS is not set
-CONFIG_INPUT_FF_MEMLESS=m
CONFIG_INPUT_JOYDEV=m
CONFIG_INPUT_EVDEV=m
# CONFIG_INPUT_KEYBOARD is not set
@@ -110,13 +108,10 @@ CONFIG_SND=m
# CONFIG_SND_DRIVERS is not set
CONFIG_SND_USB_AUDIO=m
CONFIG_HIDRAW=y
-CONFIG_HID_APPLE=m
CONFIG_HID_BELKIN=m
CONFIG_HID_CHERRY=m
CONFIG_HID_EZKEY=m
CONFIG_HID_TWINHAN=m
-CONFIG_HID_LOGITECH=m
-CONFIG_HID_LOGITECH_DJ=m
CONFIG_HID_MICROSOFT=m
CONFIG_HID_SUNPLUS=m
CONFIG_HID_SMARTJOYPLUS=m
@@ -151,8 +146,12 @@ CONFIG_CIFS=m
CONFIG_NLS=y
CONFIG_NLS_CODEPAGE_437=y
CONFIG_NLS_ISO8859_1=y
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_LZO=m
CONFIG_CRC_CCITT=m
CONFIG_CRC_T10DIF=y
+CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_MAGIC_SYSRQ=y
CONFIG_DEBUG_MEMORY_INIT=y
@@ -163,7 +162,3 @@ CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_LIST=y
CONFIG_RCU_CPU_STALL_TIMEOUT=60
# CONFIG_FTRACE is not set
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_LZO=m
-CONFIG_PRINTK_TIME=y
diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S
index a16a717c809c..b0f87f595b26 100644
--- a/arch/powerpc/crypto/crc32-vpmsum_core.S
+++ b/arch/powerpc/crypto/crc32-vpmsum_core.S
@@ -113,9 +113,7 @@ FUNC_START(CRC_FUNCTION_NAME)
#endif
#ifdef BYTESWAP_DATA
- addis r3,r2,.byteswap_constant@toc@ha
- addi r3,r3,.byteswap_constant@toc@l
-
+ LOAD_REG_ADDR(r3, .byteswap_constant)
lvx byteswap,0,r3
addi r3,r3,16
#endif
@@ -150,8 +148,7 @@ FUNC_START(CRC_FUNCTION_NAME)
addi r7,r7,-1
mtctr r7
- addis r3,r2,.constants@toc@ha
- addi r3,r3,.constants@toc@l
+ LOAD_REG_ADDR(r3, .constants)
/* Find the start of our constants */
add r3,r3,r8
@@ -506,8 +503,7 @@ FUNC_START(CRC_FUNCTION_NAME)
.Lbarrett_reduction:
/* Barrett constants */
- addis r3,r2,.barrett_constants@toc@ha
- addi r3,r3,.barrett_constants@toc@l
+ LOAD_REG_ADDR(r3, .barrett_constants)
lvx const1,0,r3
lvx const2,off16,r3
@@ -610,8 +606,7 @@ FUNC_START(CRC_FUNCTION_NAME)
cmpdi r5,0
beq .Lzero
- addis r3,r2,.short_constants@toc@ha
- addi r3,r3,.short_constants@toc@l
+ LOAD_REG_ADDR(r3, .short_constants)
/* Calculate where in the constant table we need to start */
subfic r6,r5,256
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index e80b2c0e9315..b95b666f0374 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -35,9 +35,9 @@
* However, on CPUs that don't support lwsync, lwsync actually maps to a
* heavy-weight sync, so smp_wmb() can be a lighter-weight eieio.
*/
-#define mb() __asm__ __volatile__ ("sync" : : : "memory")
-#define rmb() __asm__ __volatile__ ("sync" : : : "memory")
-#define wmb() __asm__ __volatile__ ("sync" : : : "memory")
+#define __mb() __asm__ __volatile__ ("sync" : : : "memory")
+#define __rmb() __asm__ __volatile__ ("sync" : : : "memory")
+#define __wmb() __asm__ __volatile__ ("sync" : : : "memory")
/* The sub-arch has lwsync */
#if defined(CONFIG_PPC64) || defined(CONFIG_PPC_E500MC)
@@ -51,12 +51,12 @@
/* clang defines this macro for a builtin, which will not work with runtime patching */
#undef __lwsync
#define __lwsync() __asm__ __volatile__ (stringify_in_c(LWSYNC) : : :"memory")
-#define dma_rmb() __lwsync()
-#define dma_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
+#define __dma_rmb() __lwsync()
+#define __dma_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
#define __smp_lwsync() __lwsync()
-#define __smp_mb() mb()
+#define __smp_mb() __mb()
#define __smp_rmb() __lwsync()
#define __smp_wmb() __asm__ __volatile__ (stringify_in_c(SMPWMB) : : :"memory")
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 95fd7f9485d5..c099780385dd 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -335,6 +335,7 @@
#define H_RPT_INVALIDATE 0x448
#define H_SCM_FLUSH 0x44C
#define H_GET_ENERGY_SCALE_INFO 0x450
+#define H_PKS_SIGNED_UPDATE 0x454
#define H_WATCHDOG 0x45C
#define MAX_HCALL_OPCODE H_WATCHDOG
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index eb6d094083fd..317659fdeacf 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -36,15 +36,17 @@
#define PACA_IRQ_DEC 0x08 /* Or FIT */
#define PACA_IRQ_HMI 0x10
#define PACA_IRQ_PMI 0x20
+#define PACA_IRQ_REPLAYING 0x40
/*
* Some soft-masked interrupts must be hard masked until they are replayed
* (e.g., because the soft-masked handler does not clear the exception).
+ * Interrupt replay itself must remain hard masked too.
*/
#ifdef CONFIG_PPC_BOOK3S
-#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_PMI)
+#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_PMI|PACA_IRQ_REPLAYING)
#else
-#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE)
+#define PACA_IRQ_MUST_HARD_MASK (PACA_IRQ_EE|PACA_IRQ_REPLAYING)
#endif
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
index 6d8492b6e2b8..a4196ab1d016 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -74,17 +74,18 @@
#include <asm/kprobes.h>
#include <asm/runlatch.h>
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG
/*
* WARN/BUG is handled with a program interrupt so minimise checks here to
* avoid recursion and maximise the chance of getting the first oops handled.
*/
#define INT_SOFT_MASK_BUG_ON(regs, cond) \
do { \
- if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG) && \
- (user_mode(regs) || (TRAP(regs) != INTERRUPT_PROGRAM))) \
+ if ((user_mode(regs) || (TRAP(regs) != INTERRUPT_PROGRAM))) \
BUG_ON(cond); \
} while (0)
+#else
+#define INT_SOFT_MASK_BUG_ON(regs, cond)
#endif
#ifdef CONFIG_PPC_BOOK3S_64
@@ -151,28 +152,8 @@ static inline void booke_restore_dbcr0(void)
static inline void interrupt_enter_prepare(struct pt_regs *regs)
{
-#ifdef CONFIG_PPC32
- if (!arch_irq_disabled_regs(regs))
- trace_hardirqs_off();
-
- if (user_mode(regs))
- kuap_lock();
- else
- kuap_save_and_lock(regs);
-
- if (user_mode(regs))
- account_cpu_user_entry();
-#endif
-
#ifdef CONFIG_PPC64
- bool trace_enable = false;
-
- if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS)) {
- if (irq_soft_mask_set_return(IRQS_ALL_DISABLED) == IRQS_ENABLED)
- trace_enable = true;
- } else {
- irq_soft_mask_set(IRQS_ALL_DISABLED);
- }
+ irq_soft_mask_set(IRQS_ALL_DISABLED);
/*
* If the interrupt was taken with HARD_DIS clear, then enable MSR[EE].
@@ -188,9 +169,10 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs)
} else {
__hard_RI_enable();
}
+ /* Enable MSR[RI] early, to support kernel SLB and hash faults */
+#endif
- /* Do this when RI=1 because it can cause SLB faults */
- if (trace_enable)
+ if (!arch_irq_disabled_regs(regs))
trace_hardirqs_off();
if (user_mode(regs)) {
@@ -215,7 +197,6 @@ static inline void interrupt_enter_prepare(struct pt_regs *regs)
}
INT_SOFT_MASK_BUG_ON(regs, !arch_irq_disabled_regs(regs) &&
!(regs->msr & MSR_EE));
-#endif
booke_restore_dbcr0();
}
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index 5c1516a5ba8f..deadd2149426 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -16,9 +16,6 @@
extern atomic_t ppc_n_lost_interrupts;
-/* This number is used when no interrupt has been assigned */
-#define NO_IRQ (0)
-
/* Total number of virq in the platform */
#define NR_IRQS CONFIG_NR_IRQS
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index caea15dcb91d..959f566a455c 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -876,13 +876,10 @@ struct kvm_vcpu_arch {
#define __KVM_HAVE_ARCH_WQP
#define __KVM_HAVE_CREATE_DEVICE
-static inline void kvm_arch_hardware_disable(void) {}
-static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
-static inline void kvm_arch_exit(void) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index eae9619b6190..6bef23d6d0e3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -118,7 +118,6 @@ extern int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr,
extern int kvmppc_core_vcpu_create(struct kvm_vcpu *vcpu);
extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
-extern int kvmppc_core_check_processor_compat(void);
extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr);
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 378b8d5836a7..459736d5e511 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -3,6 +3,7 @@
#define _ASM_POWERPC_MACHDEP_H
#ifdef __KERNEL__
+#include <linux/compiler.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/dma-mapping.h>
@@ -220,11 +221,16 @@ extern struct machdep_calls *machine_id;
EXPORT_SYMBOL(mach_##name); \
struct machdep_calls mach_##name __machine_desc =
-#define machine_is(name) \
- ({ \
- extern struct machdep_calls mach_##name \
- __attribute__((weak)); \
- machine_id == &mach_##name; \
+static inline bool __machine_is(const struct machdep_calls *md)
+{
+ WARN_ON(!machine_id); // complain if used before probe_machine()
+ return machine_id == md;
+}
+
+#define machine_is(name) \
+ ({ \
+ extern struct machdep_calls mach_##name __weak; \
+ __machine_is(&mach_##name); \
})
static inline void log_error(char *buf, unsigned int err_type, int fatal)
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 09f1790d0ae1..0ab3511a47d7 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -295,7 +295,6 @@ extern void free_unused_pacas(void);
#else /* CONFIG_PPC64 */
-static inline void allocate_paca_ptrs(void) { }
static inline void allocate_paca(int cpu) { }
static inline void free_unused_pacas(void) { }
diff --git a/arch/powerpc/include/asm/papr-sysparm.h b/arch/powerpc/include/asm/papr-sysparm.h
new file mode 100644
index 000000000000..f5fdbd8ae9db
--- /dev/null
+++ b/arch/powerpc/include/asm/papr-sysparm.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_PAPR_SYSPARM_H
+#define _ASM_POWERPC_PAPR_SYSPARM_H
+
+typedef struct {
+ const u32 token;
+} papr_sysparm_t;
+
+#define mk_papr_sysparm(x_) ((papr_sysparm_t){ .token = x_, })
+
+/*
+ * Derived from the "Defined Parameters" table in PAPR 7.3.16 System
+ * Parameters Option. Where the spec says "characteristics", we use
+ * "attrs" in the symbolic names to keep them from getting too
+ * unwieldy.
+ */
+#define PAPR_SYSPARM_SHARED_PROC_LPAR_ATTRS mk_papr_sysparm(20)
+#define PAPR_SYSPARM_PROC_MODULE_INFO mk_papr_sysparm(43)
+#define PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS mk_papr_sysparm(44)
+#define PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS mk_papr_sysparm(50)
+#define PAPR_SYSPARM_LPAR_NAME mk_papr_sysparm(55)
+
+enum {
+ PAPR_SYSPARM_MAX_INPUT = 1024,
+ PAPR_SYSPARM_MAX_OUTPUT = 4000,
+};
+
+struct papr_sysparm_buf {
+ __be16 len;
+ char val[PAPR_SYSPARM_MAX_OUTPUT];
+};
+
+struct papr_sysparm_buf *papr_sysparm_buf_alloc(void);
+void papr_sysparm_buf_free(struct papr_sysparm_buf *buf);
+int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf);
+int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf);
+
+#endif /* _ASM_POWERPC_PAPR_SYSPARM_H */
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index e18c95f4e1d4..71c1d26f2400 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -176,8 +176,10 @@ extern int pci_device_from_OF_node(struct device_node *node,
#endif
#ifndef CONFIG_PPC64
-#ifdef CONFIG_PPC_CHRP
+#ifdef CONFIG_PPC_PCI_OF_BUS_MAP
extern void pci_create_OF_bus_map(void);
+#else
+static inline void pci_create_OF_bus_map(void) {}
#endif
#else /* CONFIG_PPC64 */
diff --git a/arch/powerpc/include/asm/plpks.h b/arch/powerpc/include/asm/plpks.h
new file mode 100644
index 000000000000..23b77027c916
--- /dev/null
+++ b/arch/powerpc/include/asm/plpks.h
@@ -0,0 +1,195 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2022 IBM Corporation
+ * Author: Nayna Jain <nayna@linux.ibm.com>
+ *
+ * Platform keystore for pseries LPAR(PLPKS).
+ */
+
+#ifndef _ASM_POWERPC_PLPKS_H
+#define _ASM_POWERPC_PLPKS_H
+
+#ifdef CONFIG_PSERIES_PLPKS
+
+#include <linux/types.h>
+#include <linux/list.h>
+
+// Object policy flags from supported_policies
+#define PLPKS_OSSECBOOTAUDIT PPC_BIT32(1) // OS secure boot must be audit/enforce
+#define PLPKS_OSSECBOOTENFORCE PPC_BIT32(2) // OS secure boot must be enforce
+#define PLPKS_PWSET PPC_BIT32(3) // No access without password set
+#define PLPKS_WORLDREADABLE PPC_BIT32(4) // Readable without authentication
+#define PLPKS_IMMUTABLE PPC_BIT32(5) // Once written, object cannot be removed
+#define PLPKS_TRANSIENT PPC_BIT32(6) // Object does not persist through reboot
+#define PLPKS_SIGNEDUPDATE PPC_BIT32(7) // Object can only be modified by signed updates
+#define PLPKS_HVPROVISIONED PPC_BIT32(28) // Hypervisor has provisioned this object
+
+// Signature algorithm flags from signed_update_algorithms
+#define PLPKS_ALG_RSA2048 PPC_BIT(0)
+#define PLPKS_ALG_RSA4096 PPC_BIT(1)
+
+// Object label OS metadata flags
+#define PLPKS_VAR_LINUX 0x02
+#define PLPKS_VAR_COMMON 0x04
+
+// Flags for which consumer owns an object is owned by
+#define PLPKS_FW_OWNER 0x1
+#define PLPKS_BOOTLOADER_OWNER 0x2
+#define PLPKS_OS_OWNER 0x3
+
+// Flags for label metadata fields
+#define PLPKS_LABEL_VERSION 0
+#define PLPKS_MAX_LABEL_ATTR_SIZE 16
+#define PLPKS_MAX_NAME_SIZE 239
+#define PLPKS_MAX_DATA_SIZE 4000
+
+// Timeouts for PLPKS operations
+#define PLPKS_MAX_TIMEOUT 5000 // msec
+#define PLPKS_FLUSH_SLEEP 10 // msec
+#define PLPKS_FLUSH_SLEEP_RANGE 400
+
+struct plpks_var {
+ char *component;
+ u8 *name;
+ u8 *data;
+ u32 policy;
+ u16 namelen;
+ u16 datalen;
+ u8 os;
+};
+
+struct plpks_var_name {
+ u8 *name;
+ u16 namelen;
+};
+
+struct plpks_var_name_list {
+ u32 varcount;
+ struct plpks_var_name varlist[];
+};
+
+/**
+ * Updates the authenticated variable. It expects NULL as the component.
+ */
+int plpks_signed_update_var(struct plpks_var *var, u64 flags);
+
+/**
+ * Writes the specified var and its data to PKS.
+ * Any caller of PKS driver should present a valid component type for
+ * their variable.
+ */
+int plpks_write_var(struct plpks_var var);
+
+/**
+ * Removes the specified var and its data from PKS.
+ */
+int plpks_remove_var(char *component, u8 varos,
+ struct plpks_var_name vname);
+
+/**
+ * Returns the data for the specified os variable.
+ *
+ * Caller must allocate a buffer in var->data with length in var->datalen.
+ * If no buffer is provided, var->datalen will be populated with the object's
+ * size.
+ */
+int plpks_read_os_var(struct plpks_var *var);
+
+/**
+ * Returns the data for the specified firmware variable.
+ *
+ * Caller must allocate a buffer in var->data with length in var->datalen.
+ * If no buffer is provided, var->datalen will be populated with the object's
+ * size.
+ */
+int plpks_read_fw_var(struct plpks_var *var);
+
+/**
+ * Returns the data for the specified bootloader variable.
+ *
+ * Caller must allocate a buffer in var->data with length in var->datalen.
+ * If no buffer is provided, var->datalen will be populated with the object's
+ * size.
+ */
+int plpks_read_bootloader_var(struct plpks_var *var);
+
+/**
+ * Returns if PKS is available on this LPAR.
+ */
+bool plpks_is_available(void);
+
+/**
+ * Returns version of the Platform KeyStore.
+ */
+u8 plpks_get_version(void);
+
+/**
+ * Returns hypervisor storage overhead per object, not including the size of
+ * the object or label. Only valid for config version >= 2
+ */
+u16 plpks_get_objoverhead(void);
+
+/**
+ * Returns maximum password size. Must be >= 32 bytes
+ */
+u16 plpks_get_maxpwsize(void);
+
+/**
+ * Returns maximum object size supported by Platform KeyStore.
+ */
+u16 plpks_get_maxobjectsize(void);
+
+/**
+ * Returns maximum object label size supported by Platform KeyStore.
+ */
+u16 plpks_get_maxobjectlabelsize(void);
+
+/**
+ * Returns total size of the configured Platform KeyStore.
+ */
+u32 plpks_get_totalsize(void);
+
+/**
+ * Returns used space from the total size of the Platform KeyStore.
+ */
+u32 plpks_get_usedspace(void);
+
+/**
+ * Returns bitmask of policies supported by the hypervisor.
+ */
+u32 plpks_get_supportedpolicies(void);
+
+/**
+ * Returns maximum byte size of a single object supported by the hypervisor.
+ * Only valid for config version >= 3
+ */
+u32 plpks_get_maxlargeobjectsize(void);
+
+/**
+ * Returns bitmask of signature algorithms supported for signed updates.
+ * Only valid for config version >= 3
+ */
+u64 plpks_get_signedupdatealgorithms(void);
+
+/**
+ * Returns the length of the PLPKS password in bytes.
+ */
+u16 plpks_get_passwordlen(void);
+
+/**
+ * Called in early init to retrieve and clear the PLPKS password from the DT.
+ */
+void plpks_early_init_devtree(void);
+
+/**
+ * Populates the FDT with the PLPKS password to prepare for kexec.
+ */
+int plpks_populate_fdt(void *fdt);
+#else // CONFIG_PSERIES_PLPKS
+static inline bool plpks_is_available(void) { return false; }
+static inline u16 plpks_get_passwordlen(void) { BUILD_BUG(); }
+static inline void plpks_early_init_devtree(void) { }
+static inline int plpks_populate_fdt(void *fdt) { BUILD_BUG(); }
+#endif // CONFIG_PSERIES_PLPKS
+
+#endif // _ASM_POWERPC_PLPKS_H
diff --git a/arch/powerpc/include/asm/rtas-types.h b/arch/powerpc/include/asm/rtas-types.h
index 8df6235d64d1..f2ad4a96cbc5 100644
--- a/arch/powerpc/include/asm/rtas-types.h
+++ b/arch/powerpc/include/asm/rtas-types.h
@@ -18,8 +18,6 @@ struct rtas_t {
unsigned long entry; /* physical address pointer */
unsigned long base; /* physical address pointer */
unsigned long size;
- arch_spinlock_t lock;
- struct rtas_args args;
struct device_node *dev; /* virtual address pointer */
};
diff --git a/arch/powerpc/include/asm/rtas-work-area.h b/arch/powerpc/include/asm/rtas-work-area.h
new file mode 100644
index 000000000000..251a395dbd2e
--- /dev/null
+++ b/arch/powerpc/include/asm/rtas-work-area.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_POWERPC_RTAS_WORK_AREA_H
+#define _ASM_POWERPC_RTAS_WORK_AREA_H
+
+#include <linux/build_bug.h>
+#include <linux/sizes.h>
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+/**
+ * struct rtas_work_area - RTAS work area descriptor.
+ *
+ * Descriptor for a "work area" in PAPR terminology that satisfies
+ * RTAS addressing requirements.
+ */
+struct rtas_work_area {
+ /* private: Use the APIs provided below. */
+ char *buf;
+ size_t size;
+};
+
+enum {
+ /* Maximum allocation size, enforced at build time. */
+ RTAS_WORK_AREA_MAX_ALLOC_SZ = SZ_128K,
+};
+
+/**
+ * rtas_work_area_alloc() - Acquire a work area of the requested size.
+ * @size_: Allocation size. Must be compile-time constant and not more
+ * than %RTAS_WORK_AREA_MAX_ALLOC_SZ.
+ *
+ * Allocate a buffer suitable for passing to RTAS functions that have
+ * a memory address parameter, often (but not always) referred to as a
+ * "work area" in PAPR. Although callers are allowed to block while
+ * holding a work area, the amount of memory reserved for this purpose
+ * is limited, and allocations should be short-lived. A good guideline
+ * is to release any allocated work area before returning from a
+ * system call.
+ *
+ * This function does not fail. It blocks until the allocation
+ * succeeds. To prevent deadlocks, callers are discouraged from
+ * allocating more than one work area simultaneously in a single task
+ * context.
+ *
+ * Context: This function may sleep.
+ * Return: A &struct rtas_work_area descriptor for the allocated work area.
+ */
+#define rtas_work_area_alloc(size_) ({ \
+ static_assert(__builtin_constant_p(size_)); \
+ static_assert((size_) > 0); \
+ static_assert((size_) <= RTAS_WORK_AREA_MAX_ALLOC_SZ); \
+ __rtas_work_area_alloc(size_); \
+})
+
+/*
+ * Do not call __rtas_work_area_alloc() directly. Use
+ * rtas_work_area_alloc().
+ */
+struct rtas_work_area *__rtas_work_area_alloc(size_t size);
+
+/**
+ * rtas_work_area_free() - Release a work area.
+ * @area: Work area descriptor as returned from rtas_work_area_alloc().
+ *
+ * Return a work area buffer to the pool.
+ */
+void rtas_work_area_free(struct rtas_work_area *area);
+
+static inline char *rtas_work_area_raw_buf(const struct rtas_work_area *area)
+{
+ return area->buf;
+}
+
+static inline size_t rtas_work_area_size(const struct rtas_work_area *area)
+{
+ return area->size;
+}
+
+static inline phys_addr_t rtas_work_area_phys(const struct rtas_work_area *area)
+{
+ return __pa(area->buf);
+}
+
+/*
+ * Early setup for the work area allocator. Call from
+ * rtas_initialize() only.
+ */
+
+#ifdef CONFIG_PPC_PSERIES
+void rtas_work_area_reserve_arena(phys_addr_t limit);
+#else /* CONFIG_PPC_PSERIES */
+static inline void rtas_work_area_reserve_arena(phys_addr_t limit) {}
+#endif /* CONFIG_PPC_PSERIES */
+
+#endif /* _ASM_POWERPC_RTAS_WORK_AREA_H */
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index 479a95cb2770..3abe15ac79db 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -16,6 +16,185 @@
* Copyright (C) 2001 PPC 64 Team, IBM Corp
*/
+enum rtas_function_index {
+ RTAS_FNIDX__CHECK_EXCEPTION,
+ RTAS_FNIDX__DISPLAY_CHARACTER,
+ RTAS_FNIDX__EVENT_SCAN,
+ RTAS_FNIDX__FREEZE_TIME_BASE,
+ RTAS_FNIDX__GET_POWER_LEVEL,
+ RTAS_FNIDX__GET_SENSOR_STATE,
+ RTAS_FNIDX__GET_TERM_CHAR,
+ RTAS_FNIDX__GET_TIME_OF_DAY,
+ RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE,
+ RTAS_FNIDX__IBM_CBE_START_PTCAL,
+ RTAS_FNIDX__IBM_CBE_STOP_PTCAL,
+ RTAS_FNIDX__IBM_CHANGE_MSI,
+ RTAS_FNIDX__IBM_CLOSE_ERRINJCT,
+ RTAS_FNIDX__IBM_CONFIGURE_BRIDGE,
+ RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR,
+ RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP,
+ RTAS_FNIDX__IBM_CONFIGURE_PE,
+ RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW,
+ RTAS_FNIDX__IBM_DISPLAY_MESSAGE,
+ RTAS_FNIDX__IBM_ERRINJCT,
+ RTAS_FNIDX__IBM_EXTI2C,
+ RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO,
+ RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2,
+ RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE,
+ RTAS_FNIDX__IBM_GET_INDICES,
+ RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY,
+ RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER,
+ RTAS_FNIDX__IBM_GET_VPD,
+ RTAS_FNIDX__IBM_GET_XIVE,
+ RTAS_FNIDX__IBM_INT_OFF,
+ RTAS_FNIDX__IBM_INT_ON,
+ RTAS_FNIDX__IBM_IO_QUIESCE_ACK,
+ RTAS_FNIDX__IBM_LPAR_PERFTOOLS,
+ RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE,
+ RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION,
+ RTAS_FNIDX__IBM_NMI_INTERLOCK,
+ RTAS_FNIDX__IBM_NMI_REGISTER,
+ RTAS_FNIDX__IBM_OPEN_ERRINJCT,
+ RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE,
+ RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER,
+ RTAS_FNIDX__IBM_OS_TERM,
+ RTAS_FNIDX__IBM_PARTNER_CONTROL,
+ RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION,
+ RTAS_FNIDX__IBM_PLATFORM_DUMP,
+ RTAS_FNIDX__IBM_POWER_OFF_UPS,
+ RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER,
+ RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW,
+ RTAS_FNIDX__IBM_READ_PCI_CONFIG,
+ RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE,
+ RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2,
+ RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW,
+ RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS,
+ RTAS_FNIDX__IBM_SCAN_LOG_DUMP,
+ RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR,
+ RTAS_FNIDX__IBM_SET_EEH_OPTION,
+ RTAS_FNIDX__IBM_SET_SLOT_RESET,
+ RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER,
+ RTAS_FNIDX__IBM_SET_XIVE,
+ RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL,
+ RTAS_FNIDX__IBM_SUSPEND_ME,
+ RTAS_FNIDX__IBM_TUNE_DMA_PARMS,
+ RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT,
+ RTAS_FNIDX__IBM_UPDATE_NODES,
+ RTAS_FNIDX__IBM_UPDATE_PROPERTIES,
+ RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE,
+ RTAS_FNIDX__IBM_WRITE_PCI_CONFIG,
+ RTAS_FNIDX__NVRAM_FETCH,
+ RTAS_FNIDX__NVRAM_STORE,
+ RTAS_FNIDX__POWER_OFF,
+ RTAS_FNIDX__PUT_TERM_CHAR,
+ RTAS_FNIDX__QUERY_CPU_STOPPED_STATE,
+ RTAS_FNIDX__READ_PCI_CONFIG,
+ RTAS_FNIDX__RTAS_LAST_ERROR,
+ RTAS_FNIDX__SET_INDICATOR,
+ RTAS_FNIDX__SET_POWER_LEVEL,
+ RTAS_FNIDX__SET_TIME_FOR_POWER_ON,
+ RTAS_FNIDX__SET_TIME_OF_DAY,
+ RTAS_FNIDX__START_CPU,
+ RTAS_FNIDX__STOP_SELF,
+ RTAS_FNIDX__SYSTEM_REBOOT,
+ RTAS_FNIDX__THAW_TIME_BASE,
+ RTAS_FNIDX__WRITE_PCI_CONFIG,
+};
+
+/*
+ * Opaque handle for client code to refer to RTAS functions. All valid
+ * function handles are build-time constants prefixed with RTAS_FN_.
+ */
+typedef struct {
+ const enum rtas_function_index index;
+} rtas_fn_handle_t;
+
+
+#define rtas_fn_handle(x_) ((const rtas_fn_handle_t) { .index = x_, })
+
+#define RTAS_FN_CHECK_EXCEPTION rtas_fn_handle(RTAS_FNIDX__CHECK_EXCEPTION)
+#define RTAS_FN_DISPLAY_CHARACTER rtas_fn_handle(RTAS_FNIDX__DISPLAY_CHARACTER)
+#define RTAS_FN_EVENT_SCAN rtas_fn_handle(RTAS_FNIDX__EVENT_SCAN)
+#define RTAS_FN_FREEZE_TIME_BASE rtas_fn_handle(RTAS_FNIDX__FREEZE_TIME_BASE)
+#define RTAS_FN_GET_POWER_LEVEL rtas_fn_handle(RTAS_FNIDX__GET_POWER_LEVEL)
+#define RTAS_FN_GET_SENSOR_STATE rtas_fn_handle(RTAS_FNIDX__GET_SENSOR_STATE)
+#define RTAS_FN_GET_TERM_CHAR rtas_fn_handle(RTAS_FNIDX__GET_TERM_CHAR)
+#define RTAS_FN_GET_TIME_OF_DAY rtas_fn_handle(RTAS_FNIDX__GET_TIME_OF_DAY)
+#define RTAS_FN_IBM_ACTIVATE_FIRMWARE rtas_fn_handle(RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE)
+#define RTAS_FN_IBM_CBE_START_PTCAL rtas_fn_handle(RTAS_FNIDX__IBM_CBE_START_PTCAL)
+#define RTAS_FN_IBM_CBE_STOP_PTCAL rtas_fn_handle(RTAS_FNIDX__IBM_CBE_STOP_PTCAL)
+#define RTAS_FN_IBM_CHANGE_MSI rtas_fn_handle(RTAS_FNIDX__IBM_CHANGE_MSI)
+#define RTAS_FN_IBM_CLOSE_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_CLOSE_ERRINJCT)
+#define RTAS_FN_IBM_CONFIGURE_BRIDGE rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_BRIDGE)
+#define RTAS_FN_IBM_CONFIGURE_CONNECTOR rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR)
+#define RTAS_FN_IBM_CONFIGURE_KERNEL_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP)
+#define RTAS_FN_IBM_CONFIGURE_PE rtas_fn_handle(RTAS_FNIDX__IBM_CONFIGURE_PE)
+#define RTAS_FN_IBM_CREATE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW)
+#define RTAS_FN_IBM_DISPLAY_MESSAGE rtas_fn_handle(RTAS_FNIDX__IBM_DISPLAY_MESSAGE)
+#define RTAS_FN_IBM_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_ERRINJCT)
+#define RTAS_FN_IBM_EXTI2C rtas_fn_handle(RTAS_FNIDX__IBM_EXTI2C)
+#define RTAS_FN_IBM_GET_CONFIG_ADDR_INFO rtas_fn_handle(RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO)
+#define RTAS_FN_IBM_GET_CONFIG_ADDR_INFO2 rtas_fn_handle(RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2)
+#define RTAS_FN_IBM_GET_DYNAMIC_SENSOR_STATE rtas_fn_handle(RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE)
+#define RTAS_FN_IBM_GET_INDICES rtas_fn_handle(RTAS_FNIDX__IBM_GET_INDICES)
+#define RTAS_FN_IBM_GET_RIO_TOPOLOGY rtas_fn_handle(RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY)
+#define RTAS_FN_IBM_GET_SYSTEM_PARAMETER rtas_fn_handle(RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER)
+#define RTAS_FN_IBM_GET_VPD rtas_fn_handle(RTAS_FNIDX__IBM_GET_VPD)
+#define RTAS_FN_IBM_GET_XIVE rtas_fn_handle(RTAS_FNIDX__IBM_GET_XIVE)
+#define RTAS_FN_IBM_INT_OFF rtas_fn_handle(RTAS_FNIDX__IBM_INT_OFF)
+#define RTAS_FN_IBM_INT_ON rtas_fn_handle(RTAS_FNIDX__IBM_INT_ON)
+#define RTAS_FN_IBM_IO_QUIESCE_ACK rtas_fn_handle(RTAS_FNIDX__IBM_IO_QUIESCE_ACK)
+#define RTAS_FN_IBM_LPAR_PERFTOOLS rtas_fn_handle(RTAS_FNIDX__IBM_LPAR_PERFTOOLS)
+#define RTAS_FN_IBM_MANAGE_FLASH_IMAGE rtas_fn_handle(RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE)
+#define RTAS_FN_IBM_MANAGE_STORAGE_PRESERVATION rtas_fn_handle(RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION)
+#define RTAS_FN_IBM_NMI_INTERLOCK rtas_fn_handle(RTAS_FNIDX__IBM_NMI_INTERLOCK)
+#define RTAS_FN_IBM_NMI_REGISTER rtas_fn_handle(RTAS_FNIDX__IBM_NMI_REGISTER)
+#define RTAS_FN_IBM_OPEN_ERRINJCT rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_ERRINJCT)
+#define RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE)
+#define RTAS_FN_IBM_OPEN_SRIOV_MAP_PE_NUMBER rtas_fn_handle(RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER)
+#define RTAS_FN_IBM_OS_TERM rtas_fn_handle(RTAS_FNIDX__IBM_OS_TERM)
+#define RTAS_FN_IBM_PARTNER_CONTROL rtas_fn_handle(RTAS_FNIDX__IBM_PARTNER_CONTROL)
+#define RTAS_FN_IBM_PHYSICAL_ATTESTATION rtas_fn_handle(RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION)
+#define RTAS_FN_IBM_PLATFORM_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_PLATFORM_DUMP)
+#define RTAS_FN_IBM_POWER_OFF_UPS rtas_fn_handle(RTAS_FNIDX__IBM_POWER_OFF_UPS)
+#define RTAS_FN_IBM_QUERY_INTERRUPT_SOURCE_NUMBER rtas_fn_handle(RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER)
+#define RTAS_FN_IBM_QUERY_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW)
+#define RTAS_FN_IBM_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_READ_PCI_CONFIG)
+#define RTAS_FN_IBM_READ_SLOT_RESET_STATE rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE)
+#define RTAS_FN_IBM_READ_SLOT_RESET_STATE2 rtas_fn_handle(RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2)
+#define RTAS_FN_IBM_REMOVE_PE_DMA_WINDOW rtas_fn_handle(RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW)
+#define RTAS_FN_IBM_RESET_PE_DMA_WINDOWS rtas_fn_handle(RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS)
+#define RTAS_FN_IBM_SCAN_LOG_DUMP rtas_fn_handle(RTAS_FNIDX__IBM_SCAN_LOG_DUMP)
+#define RTAS_FN_IBM_SET_DYNAMIC_INDICATOR rtas_fn_handle(RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR)
+#define RTAS_FN_IBM_SET_EEH_OPTION rtas_fn_handle(RTAS_FNIDX__IBM_SET_EEH_OPTION)
+#define RTAS_FN_IBM_SET_SLOT_RESET rtas_fn_handle(RTAS_FNIDX__IBM_SET_SLOT_RESET)
+#define RTAS_FN_IBM_SET_SYSTEM_PARAMETER rtas_fn_handle(RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER)
+#define RTAS_FN_IBM_SET_XIVE rtas_fn_handle(RTAS_FNIDX__IBM_SET_XIVE)
+#define RTAS_FN_IBM_SLOT_ERROR_DETAIL rtas_fn_handle(RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL)
+#define RTAS_FN_IBM_SUSPEND_ME rtas_fn_handle(RTAS_FNIDX__IBM_SUSPEND_ME)
+#define RTAS_FN_IBM_TUNE_DMA_PARMS rtas_fn_handle(RTAS_FNIDX__IBM_TUNE_DMA_PARMS)
+#define RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT)
+#define RTAS_FN_IBM_UPDATE_NODES rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_NODES)
+#define RTAS_FN_IBM_UPDATE_PROPERTIES rtas_fn_handle(RTAS_FNIDX__IBM_UPDATE_PROPERTIES)
+#define RTAS_FN_IBM_VALIDATE_FLASH_IMAGE rtas_fn_handle(RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE)
+#define RTAS_FN_IBM_WRITE_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__IBM_WRITE_PCI_CONFIG)
+#define RTAS_FN_NVRAM_FETCH rtas_fn_handle(RTAS_FNIDX__NVRAM_FETCH)
+#define RTAS_FN_NVRAM_STORE rtas_fn_handle(RTAS_FNIDX__NVRAM_STORE)
+#define RTAS_FN_POWER_OFF rtas_fn_handle(RTAS_FNIDX__POWER_OFF)
+#define RTAS_FN_PUT_TERM_CHAR rtas_fn_handle(RTAS_FNIDX__PUT_TERM_CHAR)
+#define RTAS_FN_QUERY_CPU_STOPPED_STATE rtas_fn_handle(RTAS_FNIDX__QUERY_CPU_STOPPED_STATE)
+#define RTAS_FN_READ_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__READ_PCI_CONFIG)
+#define RTAS_FN_RTAS_LAST_ERROR rtas_fn_handle(RTAS_FNIDX__RTAS_LAST_ERROR)
+#define RTAS_FN_SET_INDICATOR rtas_fn_handle(RTAS_FNIDX__SET_INDICATOR)
+#define RTAS_FN_SET_POWER_LEVEL rtas_fn_handle(RTAS_FNIDX__SET_POWER_LEVEL)
+#define RTAS_FN_SET_TIME_FOR_POWER_ON rtas_fn_handle(RTAS_FNIDX__SET_TIME_FOR_POWER_ON)
+#define RTAS_FN_SET_TIME_OF_DAY rtas_fn_handle(RTAS_FNIDX__SET_TIME_OF_DAY)
+#define RTAS_FN_START_CPU rtas_fn_handle(RTAS_FNIDX__START_CPU)
+#define RTAS_FN_STOP_SELF rtas_fn_handle(RTAS_FNIDX__STOP_SELF)
+#define RTAS_FN_SYSTEM_REBOOT rtas_fn_handle(RTAS_FNIDX__SYSTEM_REBOOT)
+#define RTAS_FN_THAW_TIME_BASE rtas_fn_handle(RTAS_FNIDX__THAW_TIME_BASE)
+#define RTAS_FN_WRITE_PCI_CONFIG rtas_fn_handle(RTAS_FNIDX__WRITE_PCI_CONFIG)
+
#define RTAS_UNKNOWN_SERVICE (-1)
#define RTAS_INSTANTIATE_MAX (1ULL<<30) /* Don't instantiate rtas at/above this value */
@@ -222,6 +401,11 @@ extern void (*rtas_flash_term_hook)(int);
extern struct rtas_t rtas;
+s32 rtas_function_token(const rtas_fn_handle_t handle);
+static inline bool rtas_function_implemented(const rtas_fn_handle_t handle)
+{
+ return rtas_function_token(handle) != RTAS_UNKNOWN_SERVICE;
+}
extern int rtas_token(const char *service);
extern int rtas_service_present(const char *service);
extern int rtas_call(int token, int, int, int *, ...);
diff --git a/arch/powerpc/include/asm/secvar.h b/arch/powerpc/include/asm/secvar.h
index 4cc35b58b986..4828e0ab7e3c 100644
--- a/arch/powerpc/include/asm/secvar.h
+++ b/arch/powerpc/include/asm/secvar.h
@@ -10,25 +10,30 @@
#include <linux/types.h>
#include <linux/errno.h>
+#include <linux/sysfs.h>
extern const struct secvar_operations *secvar_ops;
struct secvar_operations {
- int (*get)(const char *key, uint64_t key_len, u8 *data,
- uint64_t *data_size);
- int (*get_next)(const char *key, uint64_t *key_len,
- uint64_t keybufsize);
- int (*set)(const char *key, uint64_t key_len, u8 *data,
- uint64_t data_size);
+ int (*get)(const char *key, u64 key_len, u8 *data, u64 *data_size);
+ int (*get_next)(const char *key, u64 *key_len, u64 keybufsize);
+ int (*set)(const char *key, u64 key_len, u8 *data, u64 data_size);
+ ssize_t (*format)(char *buf, size_t bufsize);
+ int (*max_size)(u64 *max_size);
+ const struct attribute **config_attrs;
+
+ // NULL-terminated array of fixed variable names
+ // Only used if get_next() isn't provided
+ const char * const *var_names;
};
#ifdef CONFIG_PPC_SECURE_BOOT
-extern void set_secvar_ops(const struct secvar_operations *ops);
+int set_secvar_ops(const struct secvar_operations *ops);
#else
-static inline void set_secvar_ops(const struct secvar_operations *ops) { }
+static inline int set_secvar_ops(const struct secvar_operations *ops) { return 0; }
#endif
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index f63505d74932..6c6cb53d7045 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -26,6 +26,7 @@
#include <asm/percpu.h>
extern int boot_cpuid;
+extern int boot_cpu_hwid; /* PPC64 only */
extern int spinning_secondaries;
extern u32 *cpu_to_phys_id;
extern bool coregroup_enabled;
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index 08cd60cd70b7..82cc2c6704e6 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -119,6 +119,109 @@ TRACE_EVENT_FN_COND(hcall_exit,
);
#endif
+#ifdef CONFIG_PPC_RTAS
+
+#include <asm/rtas-types.h>
+
+TRACE_EVENT(rtas_input,
+
+ TP_PROTO(struct rtas_args *rtas_args, const char *name),
+
+ TP_ARGS(rtas_args, name),
+
+ TP_STRUCT__entry(
+ __field(__u32, nargs)
+ __string(name, name)
+ __dynamic_array(__u32, inputs, be32_to_cpu(rtas_args->nargs))
+ ),
+
+ TP_fast_assign(
+ __entry->nargs = be32_to_cpu(rtas_args->nargs);
+ __assign_str(name, name);
+ be32_to_cpu_array(__get_dynamic_array(inputs), rtas_args->args, __entry->nargs);
+ ),
+
+ TP_printk("%s arguments: %s", __get_str(name),
+ __print_array(__get_dynamic_array(inputs), __entry->nargs, 4)
+ )
+);
+
+TRACE_EVENT(rtas_output,
+
+ TP_PROTO(struct rtas_args *rtas_args, const char *name),
+
+ TP_ARGS(rtas_args, name),
+
+ TP_STRUCT__entry(
+ __field(__u32, nr_other)
+ __field(__s32, status)
+ __string(name, name)
+ __dynamic_array(__u32, other_outputs, be32_to_cpu(rtas_args->nret) - 1)
+ ),
+
+ TP_fast_assign(
+ __entry->nr_other = be32_to_cpu(rtas_args->nret) - 1;
+ __entry->status = be32_to_cpu(rtas_args->rets[0]);
+ __assign_str(name, name);
+ be32_to_cpu_array(__get_dynamic_array(other_outputs),
+ &rtas_args->rets[1], __entry->nr_other);
+ ),
+
+ TP_printk("%s status: %d, other outputs: %s", __get_str(name), __entry->status,
+ __print_array(__get_dynamic_array(other_outputs),
+ __entry->nr_other, 4)
+ )
+);
+
+DECLARE_EVENT_CLASS(rtas_parameter_block,
+
+ TP_PROTO(struct rtas_args *rtas_args),
+
+ TP_ARGS(rtas_args),
+
+ TP_STRUCT__entry(
+ __field(u32, token)
+ __field(u32, nargs)
+ __field(u32, nret)
+ __array(__u32, params, 16)
+ ),
+
+ TP_fast_assign(
+ __entry->token = be32_to_cpu(rtas_args->token);
+ __entry->nargs = be32_to_cpu(rtas_args->nargs);
+ __entry->nret = be32_to_cpu(rtas_args->nret);
+ be32_to_cpu_array(__entry->params, rtas_args->args, ARRAY_SIZE(rtas_args->args));
+ ),
+
+ TP_printk("token=%u nargs=%u nret=%u params:"
+ " [0]=0x%08x [1]=0x%08x [2]=0x%08x [3]=0x%08x"
+ " [4]=0x%08x [5]=0x%08x [6]=0x%08x [7]=0x%08x"
+ " [8]=0x%08x [9]=0x%08x [10]=0x%08x [11]=0x%08x"
+ " [12]=0x%08x [13]=0x%08x [14]=0x%08x [15]=0x%08x",
+ __entry->token, __entry->nargs, __entry->nret,
+ __entry->params[0], __entry->params[1], __entry->params[2], __entry->params[3],
+ __entry->params[4], __entry->params[5], __entry->params[6], __entry->params[7],
+ __entry->params[8], __entry->params[9], __entry->params[10], __entry->params[11],
+ __entry->params[12], __entry->params[13], __entry->params[14], __entry->params[15]
+ )
+);
+
+DEFINE_EVENT(rtas_parameter_block, rtas_ll_entry,
+
+ TP_PROTO(struct rtas_args *rtas_args),
+
+ TP_ARGS(rtas_args)
+);
+
+DEFINE_EVENT(rtas_parameter_block, rtas_ll_exit,
+
+ TP_PROTO(struct rtas_args *rtas_args),
+
+ TP_ARGS(rtas_args)
+);
+
+#endif /* CONFIG_PPC_RTAS */
+
#ifdef CONFIG_PPC_POWERNV
extern int opal_tracepoint_regfunc(void);
extern void opal_tracepoint_unregfunc(void);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9b6146056e48..9bf2be123093 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -54,6 +54,13 @@ CFLAGS_cputable.o += -DDISABLE_BRANCH_PROFILING
CFLAGS_btext.o += -DDISABLE_BRANCH_PROFILING
endif
+KCSAN_SANITIZE_early_32.o := n
+KCSAN_SANITIZE_early_64.o := n
+KCSAN_SANITIZE_cputable.o := n
+KCSAN_SANITIZE_btext.o := n
+KCSAN_SANITIZE_paca.o := n
+KCSAN_SANITIZE_setup_64.o := n
+
#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
# Remove stack protector to avoid triggering unneeded stack canary
# checks due to randomize_kstack_offset.
@@ -177,12 +184,15 @@ obj-$(CONFIG_PPC_SECVAR_SYSFS) += secvar-sysfs.o
# Disable GCOV, KCOV & sanitizers in odd or sensitive code
GCOV_PROFILE_prom_init.o := n
KCOV_INSTRUMENT_prom_init.o := n
+KCSAN_SANITIZE_prom_init.o := n
UBSAN_SANITIZE_prom_init.o := n
GCOV_PROFILE_kprobes.o := n
KCOV_INSTRUMENT_kprobes.o := n
+KCSAN_SANITIZE_kprobes.o := n
UBSAN_SANITIZE_kprobes.o := n
GCOV_PROFILE_kprobes-ftrace.o := n
KCOV_INSTRUMENT_kprobes-ftrace.o := n
+KCSAN_SANITIZE_kprobes-ftrace.o := n
UBSAN_SANITIZE_kprobes-ftrace.o := n
GCOV_PROFILE_syscall_64.o := n
KCOV_INSTRUMENT_syscall_64.o := n
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index f279295179bd..438568a472d0 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1065,10 +1065,10 @@ recover_failed:
eeh_slot_error_detail(pe, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */
- eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_set_irq_state(pe, false);
eeh_pe_report("error_detected(permanent failure)", pe,
eeh_report_failure, NULL);
+ eeh_set_channel_state(pe, pci_channel_io_perm_failure);
/* Mark the PE to be removed permanently */
eeh_pe_state_mark(pe, EEH_PE_REMOVED);
@@ -1185,10 +1185,10 @@ void eeh_handle_special_event(void)
/* Notify all devices to be down */
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
- eeh_set_channel_state(pe, pci_channel_io_perm_failure);
eeh_pe_report(
"error_detected(permanent failure)", pe,
eeh_report_failure, NULL);
+ eeh_set_channel_state(pe, pci_channel_io_perm_failure);
pci_lock_rescan_remove();
list_for_each_entry(hose, &hose_list, list_node) {
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 69a912550577..033116e465d0 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -21,7 +21,13 @@ _GLOBAL(epapr_ev_idle)
ori r4, r4,_TLF_NAPPING /* so when we take an exception */
PPC_STL r4, TI_LOCAL_FLAGS(r2) /* it will return to our caller */
+#ifdef CONFIG_BOOKE_OR_40x
wrteei 1
+#else
+ mfmsr r4
+ ori r4, r4, MSR_EE
+ mtmsr r4
+#endif
idle_loop:
LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 7558ba4eb864..1febb56ebaeb 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -160,12 +160,8 @@ __secondary_hold:
std r24,(ABS_ADDR(__secondary_hold_acknowledge, first_256B))(0)
sync
- li r26,0
-#ifdef CONFIG_PPC_BOOK3E_64
- tovirt(r26,r26)
-#endif
/* All secondary cpus wait here until told to start. */
-100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(r26)
+100: ld r12,(ABS_ADDR(__secondary_hold_spinloop, first_256B))(0)
cmpdi 0,r12,0
beq 100b
@@ -475,8 +471,31 @@ SYM_FUNC_START_LOCAL(__mmu_off)
rfid
b . /* prevent speculative execution */
SYM_FUNC_END(__mmu_off)
-#endif
+SYM_FUNC_START_LOCAL(start_initialization_book3s)
+ mflr r25
+
+ /* Setup some critical 970 SPRs before switching MMU off */
+ mfspr r0,SPRN_PVR
+ srwi r0,r0,16
+ cmpwi r0,0x39 /* 970 */
+ beq 1f
+ cmpwi r0,0x3c /* 970FX */
+ beq 1f
+ cmpwi r0,0x44 /* 970MP */
+ beq 1f
+ cmpwi r0,0x45 /* 970GX */
+ bne 2f
+1: bl __cpu_preinit_ppc970
+2:
+
+ /* Switch off MMU if not already off */
+ bl __mmu_off
+
+ mtlr r25
+ blr
+SYM_FUNC_END(start_initialization_book3s)
+#endif
/*
* Here is our main kernel entry point. We support currently 2 kind of entries
@@ -523,26 +542,10 @@ __start_initialization_multiplatform:
#ifdef CONFIG_PPC_BOOK3E_64
bl start_initialization_book3e
- b __after_prom_start
#else
- /* Setup some critical 970 SPRs before switching MMU off */
- mfspr r0,SPRN_PVR
- srwi r0,r0,16
- cmpwi r0,0x39 /* 970 */
- beq 1f
- cmpwi r0,0x3c /* 970FX */
- beq 1f
- cmpwi r0,0x44 /* 970MP */
- beq 1f
- cmpwi r0,0x45 /* 970GX */
- bne 2f
-1: bl __cpu_preinit_ppc970
-2:
-
- /* Switch off MMU if not already off */
- bl __mmu_off
- b __after_prom_start
+ bl start_initialization_book3s
#endif /* CONFIG_PPC_BOOK3E_64 */
+ b __after_prom_start
__REF
__boot_from_prom:
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index caebe1431596..ee95937bdaf1 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -67,11 +67,9 @@ static void iommu_debugfs_add(struct iommu_table *tbl)
static void iommu_debugfs_del(struct iommu_table *tbl)
{
char name[10];
- struct dentry *liobn_entry;
sprintf(name, "%08lx", tbl->it_index);
- liobn_entry = debugfs_lookup(name, iommu_debugfs_dir);
- debugfs_remove(liobn_entry);
+ debugfs_lookup_and_remove(name, iommu_debugfs_dir);
}
#else
static void iommu_debugfs_add(struct iommu_table *tbl){}
diff --git a/arch/powerpc/kernel/irq_64.c b/arch/powerpc/kernel/irq_64.c
index eb2b380e52a0..c788c55512ed 100644
--- a/arch/powerpc/kernel/irq_64.c
+++ b/arch/powerpc/kernel/irq_64.c
@@ -70,22 +70,19 @@ int distribute_irqs = 1;
static inline void next_interrupt(struct pt_regs *regs)
{
- /*
- * Softirq processing can enable/disable irqs, which will leave
- * MSR[EE] enabled and the soft mask set to IRQS_DISABLED. Fix
- * this up.
- */
- if (!(local_paca->irq_happened & PACA_IRQ_HARD_DIS))
- hard_irq_disable();
- else
- irq_soft_mask_set(IRQS_ALL_DISABLED);
+ if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
+ WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS));
+ WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED);
+ }
/*
* We are responding to the next interrupt, so interrupt-off
* latencies should be reset here.
*/
+ lockdep_hardirq_exit();
trace_hardirqs_on();
trace_hardirqs_off();
+ lockdep_hardirq_enter();
}
static inline bool irq_happened_test_and_clear(u8 irq)
@@ -97,22 +94,11 @@ static inline bool irq_happened_test_and_clear(u8 irq)
return false;
}
-void replay_soft_interrupts(void)
+static __no_kcsan void __replay_soft_interrupts(void)
{
struct pt_regs regs;
/*
- * Be careful here, calling these interrupt handlers can cause
- * softirqs to be raised, which they may run when calling irq_exit,
- * which will cause local_irq_enable() to be run, which can then
- * recurse into this function. Don't keep any state across
- * interrupt handler calls which may change underneath us.
- *
- * Softirqs can not be disabled over replay to stop this recursion
- * because interrupts taken in idle code may require RCU softirq
- * to run in the irq RCU tracking context. This is a hard problem
- * to fix without changes to the softirq or idle layer.
- *
* We use local_paca rather than get_paca() to avoid all the
* debug_smp_processor_id() business in this low level function.
*/
@@ -120,13 +106,20 @@ void replay_soft_interrupts(void)
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
WARN_ON_ONCE(mfmsr() & MSR_EE);
WARN_ON(!(local_paca->irq_happened & PACA_IRQ_HARD_DIS));
+ WARN_ON(local_paca->irq_happened & PACA_IRQ_REPLAYING);
}
+ /*
+ * PACA_IRQ_REPLAYING prevents interrupt handlers from enabling
+ * MSR[EE] to get PMIs, which can result in more IRQs becoming
+ * pending.
+ */
+ local_paca->irq_happened |= PACA_IRQ_REPLAYING;
+
ppc_save_regs(&regs);
regs.softe = IRQS_ENABLED;
regs.msr |= MSR_EE;
-again:
/*
* Force the delivery of pending soft-disabled interrupts on PS3.
* Any HV call will have this side effect.
@@ -175,17 +168,18 @@ again:
next_interrupt(&regs);
}
- /*
- * Softirq processing can enable and disable interrupts, which can
- * result in new irqs becoming pending. Must keep looping until we
- * have cleared out all pending interrupts.
- */
- if (local_paca->irq_happened & ~PACA_IRQ_HARD_DIS)
- goto again;
+ local_paca->irq_happened &= ~PACA_IRQ_REPLAYING;
+}
+
+__no_kcsan void replay_soft_interrupts(void)
+{
+ irq_enter(); /* See comment in arch_local_irq_restore */
+ __replay_soft_interrupts();
+ irq_exit();
}
#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_KUAP)
-static inline void replay_soft_interrupts_irqrestore(void)
+static inline __no_kcsan void replay_soft_interrupts_irqrestore(void)
{
unsigned long kuap_state = get_kuap();
@@ -200,16 +194,16 @@ static inline void replay_soft_interrupts_irqrestore(void)
if (kuap_state != AMR_KUAP_BLOCKED)
set_kuap(AMR_KUAP_BLOCKED);
- replay_soft_interrupts();
+ __replay_soft_interrupts();
if (kuap_state != AMR_KUAP_BLOCKED)
set_kuap(kuap_state);
}
#else
-#define replay_soft_interrupts_irqrestore() replay_soft_interrupts()
+#define replay_soft_interrupts_irqrestore() __replay_soft_interrupts()
#endif
-notrace void arch_local_irq_restore(unsigned long mask)
+notrace __no_kcsan void arch_local_irq_restore(unsigned long mask)
{
unsigned char irq_happened;
@@ -219,9 +213,13 @@ notrace void arch_local_irq_restore(unsigned long mask)
return;
}
- if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
- WARN_ON_ONCE(in_nmi() || in_hardirq());
+ if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) {
+ WARN_ON_ONCE(in_nmi());
+ WARN_ON_ONCE(in_hardirq());
+ WARN_ON_ONCE(local_paca->irq_happened & PACA_IRQ_REPLAYING);
+ }
+again:
/*
* After the stb, interrupts are unmasked and there are no interrupts
* pending replay. The restart sequence makes this atomic with
@@ -248,6 +246,12 @@ notrace void arch_local_irq_restore(unsigned long mask)
if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
WARN_ON_ONCE(!(mfmsr() & MSR_EE));
+ /*
+ * If we came here from the replay below, we might have a preempt
+ * pending (due to preempt_enable_no_resched()). Have to check now.
+ */
+ preempt_check_resched();
+
return;
happened:
@@ -261,6 +265,7 @@ happened:
irq_soft_mask_set(IRQS_ENABLED);
local_paca->irq_happened = 0;
__hard_irq_enable();
+ preempt_check_resched();
return;
}
@@ -296,12 +301,38 @@ happened:
irq_soft_mask_set(IRQS_ALL_DISABLED);
trace_hardirqs_off();
+ /*
+ * Now enter interrupt context. The interrupt handlers themselves
+ * also call irq_enter/exit (which is okay, they can nest). But call
+ * it here now to hold off softirqs until the below irq_exit(). If
+ * we allowed replayed handlers to run softirqs, that enables irqs,
+ * which must replay interrupts, which recurses in here and makes
+ * things more complicated. The recursion is limited to 2, and it can
+ * be made to work, but it's complicated.
+ *
+ * local_bh_disable can not be used here because interrupts taken in
+ * idle are not in the right context (RCU, tick, etc) to run softirqs
+ * so irq_enter must be called.
+ */
+ irq_enter();
+
replay_soft_interrupts_irqrestore();
+ irq_exit();
+
+ if (unlikely(local_paca->irq_happened != PACA_IRQ_HARD_DIS)) {
+ /*
+ * The softirq processing in irq_exit() may enable interrupts
+ * temporarily, which can result in MSR[EE] being enabled and
+ * more irqs becoming pending. Go around again if that happens.
+ */
+ trace_hardirqs_on();
+ preempt_enable_no_resched();
+ goto again;
+ }
+
trace_hardirqs_on();
irq_soft_mask_set(IRQS_ENABLED);
- if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
- WARN_ON(local_paca->irq_happened != PACA_IRQ_HARD_DIS);
local_paca->irq_happened = 0;
__hard_irq_enable();
preempt_enable();
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6c5d30fba766..219f28637a3e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -131,6 +131,13 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (mce->error_type == MCE_ERROR_TYPE_UE)
mce->u.ue_error.ignore_event = mce_err->ignore_event;
+ /*
+ * Raise irq work, So that we don't miss to log the error for
+ * unrecoverable errors.
+ */
+ if (mce->disposition == MCE_DISPOSITION_NOT_RECOVERED)
+ mce_irq_work_queue();
+
if (!addr)
return;
@@ -233,9 +240,6 @@ static void machine_check_ue_event(struct machine_check_event *evt)
}
memcpy(&local_paca->mce_info->mce_ue_event_queue[index],
evt, sizeof(*evt));
-
- /* Queue work to process this event later. */
- mce_irq_work_queue();
}
/*
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index ff045644f13f..2ac78d207f77 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -502,9 +502,10 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
static int restore_r2(const char *name, u32 *instruction, struct module *me)
{
u32 *prev_insn = instruction - 1;
+ u32 insn_val = *instruction;
if (is_mprofile_ftrace_call(name))
- return 1;
+ return 0;
/*
* Make sure the branch isn't a sibling call. Sibling calls aren't
@@ -512,19 +513,25 @@ static int restore_r2(const char *name, u32 *instruction, struct module *me)
* restore afterwards.
*/
if (!instr_is_relative_link_branch(ppc_inst(*prev_insn)))
- return 1;
+ return 0;
- if (*instruction != PPC_RAW_NOP()) {
- pr_err("%s: Expected nop after call, got %08x at %pS\n",
- me->name, *instruction, instruction);
+ /*
+ * For livepatch, the restore r2 instruction might have already been
+ * written previously, if the referenced symbol is in a previously
+ * unloaded module which is now being loaded again. In that case, skip
+ * the warning and the instruction write.
+ */
+ if (insn_val == PPC_INST_LD_TOC)
return 0;
+
+ if (insn_val != PPC_RAW_NOP()) {
+ pr_err("%s: Expected nop after call, got %08x at %pS\n",
+ me->name, insn_val, instruction);
+ return -ENOEXEC;
}
/* ld r2,R2_STACK_OFFSET(r1) */
- if (patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC)))
- return 0;
-
- return 1;
+ return patch_instruction(instruction, ppc_inst(PPC_INST_LD_TOC));
}
int apply_relocate_add(Elf64_Shdr *sechdrs,
@@ -648,8 +655,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
strtab + sym->st_name);
if (!value)
return -ENOENT;
- if (!restore_r2(strtab + sym->st_name,
- (u32 *)location + 1, me))
+ if (restore_r2(strtab + sym->st_name,
+ (u32 *)location + 1, me))
return -ENOEXEC;
} else
value += local_entry_offset(sym);
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index 855b59892c5c..ce0c8623e563 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -62,7 +62,7 @@ fixup_cpc710_pci64(struct pci_dev* dev)
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CPC710_PCI64, fixup_cpc710_pci64);
-#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP)
+#ifdef CONFIG_PPC_PCI_OF_BUS_MAP
static u8* pci_to_OF_bus_map;
static int pci_bus_count;
@@ -152,6 +152,7 @@ pcibios_make_OF_bus_map(void)
}
#endif
}
+#endif // CONFIG_PPC_PCI_OF_BUS_MAP
#ifdef CONFIG_PPC_PMAC
@@ -160,7 +161,9 @@ pcibios_make_OF_bus_map(void)
*/
int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
{
+#ifdef CONFIG_PPC_PCI_OF_BUS_MAP
struct pci_dev *dev = NULL;
+#endif
const __be32 *reg;
int size;
@@ -175,6 +178,9 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
*bus = (be32_to_cpup(&reg[0]) >> 16) & 0xff;
*devfn = (be32_to_cpup(&reg[0]) >> 8) & 0xff;
+#ifndef CONFIG_PPC_PCI_OF_BUS_MAP
+ return 0;
+#else
/* Ok, here we need some tweak. If we have already renumbered
* all busses, we can't rely on the OF bus number any more.
* the pci_to_OF_bus_map is not enough as several PCI busses
@@ -192,11 +198,12 @@ int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn)
}
return -ENODEV;
+#endif // CONFIG_PPC_PCI_OF_BUS_MAP
}
EXPORT_SYMBOL(pci_device_from_OF_node);
#endif
-#ifdef CONFIG_PPC_CHRP
+#ifdef CONFIG_PPC_PCI_OF_BUS_MAP
/* We create the "pci-OF-bus-map" property now so it appears in the
* /proc device tree
*/
@@ -221,9 +228,7 @@ pci_create_OF_bus_map(void)
of_node_put(dn);
}
}
-#endif
-
-#endif /* defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP) */
+#endif // CONFIG_PPC_PCI_OF_BUS_MAP
void pcibios_setup_phb_io_space(struct pci_controller *hose)
{
@@ -273,6 +278,7 @@ static int __init pcibios_init(void)
}
#if defined(CONFIG_PPC_PMAC) || defined(CONFIG_PPC_CHRP)
+#ifdef CONFIG_PPC_PCI_OF_BUS_MAP
pci_bus_count = next_busno;
/* OpenFirmware based machines need a map of OF bus
@@ -282,6 +288,7 @@ static int __init pcibios_init(void)
if (pci_assign_all_buses)
pcibios_make_OF_bus_map();
#endif
+#endif
/* Call common code to handle resource allocation */
pcibios_resource_survey();
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index c22cc234672f..4b29ac5ddac6 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1405,8 +1405,7 @@ static void show_instructions(struct pt_regs *regs)
for (i = 0; i < NR_INSN_TO_PRINT; i++) {
int instr;
- if (!__kernel_text_address(pc) ||
- get_kernel_nofault(instr, (const void *)pc)) {
+ if (get_kernel_nofault(instr, (const void *)pc)) {
pr_cont("XXXXXXXX ");
} else {
if (nip == pc)
@@ -2118,6 +2117,9 @@ static inline int valid_irq_stack(unsigned long sp, struct task_struct *p,
unsigned long stack_page;
unsigned long cpu = task_cpu(p);
+ if (!hardirq_ctx[cpu] || !softirq_ctx[cpu])
+ return 0;
+
stack_page = (unsigned long)hardirq_ctx[cpu];
if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
return 1;
@@ -2139,6 +2141,14 @@ static inline int valid_emergency_stack(unsigned long sp, struct task_struct *p,
if (!paca_ptrs)
return 0;
+ if (!paca_ptrs[cpu]->emergency_sp)
+ return 0;
+
+# ifdef CONFIG_PPC_BOOK3S_64
+ if (!paca_ptrs[cpu]->nmi_emergency_sp || !paca_ptrs[cpu]->mc_emergency_sp)
+ return 0;
+#endif
+
stack_page = (unsigned long)paca_ptrs[cpu]->emergency_sp - THREAD_SIZE;
if (sp >= stack_page && sp <= stack_page + THREAD_SIZE - nbytes)
return 1;
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 4f1c920aa13e..9d9ee4e9e1a1 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -56,6 +56,7 @@
#include <asm/drmem.h>
#include <asm/ultravisor.h>
#include <asm/prom.h>
+#include <asm/plpks.h>
#include <mm/mmu_decl.h>
@@ -370,8 +371,8 @@ static int __init early_init_dt_scan_cpus(unsigned long node,
be32_to_cpu(intserv[found_thread]));
boot_cpuid = found;
- // Pass the boot CPU's hard CPU id back to our caller
- *((u32 *)data) = be32_to_cpu(intserv[found_thread]);
+ if (IS_ENABLED(CONFIG_PPC64))
+ boot_cpu_hwid = be32_to_cpu(intserv[found_thread]);
/*
* PAPR defines "logical" PVR values for cpus that
@@ -755,7 +756,6 @@ static inline void save_fscr_to_task(void) {}
void __init early_init_devtree(void *params)
{
- u32 boot_cpu_hwid;
phys_addr_t limit;
DBG(" -> early_init_devtree(%px)\n", params);
@@ -851,7 +851,7 @@ void __init early_init_devtree(void *params)
/* Retrieve CPU related informations from the flat tree
* (altivec support, boot CPU ID, ...)
*/
- of_scan_flat_dt(early_init_dt_scan_cpus, &boot_cpu_hwid);
+ of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
if (boot_cpuid < 0) {
printk("Failed to identify boot CPU !\n");
BUG();
@@ -868,11 +868,6 @@ void __init early_init_devtree(void *params)
mmu_early_init_devtree();
- // NB. paca is not installed until later in early_setup()
- allocate_paca_ptrs();
- allocate_paca(boot_cpuid);
- set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid);
-
#ifdef CONFIG_PPC_POWERNV
/* Scan and build the list of machine check recoverable ranges */
of_scan_flat_dt(early_init_dt_scan_recoverable_ranges, NULL);
@@ -893,6 +888,9 @@ void __init early_init_devtree(void *params)
powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE;
#endif
+ /* If kexec left a PLPKS password in the DT, get it and clear it */
+ plpks_early_init_devtree();
+
tm_init();
DBG(" <- early_init_devtree()\n");
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index 311890d71c4c..5a319863f289 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -51,11 +51,10 @@ do
# a leading . on the name, so strip it off here.
UNDEF="${UNDEF#.}"
- if [ $KBUILD_VERBOSE ]; then
- if [ $KBUILD_VERBOSE -ne 0 ]; then
- echo "Checking prom_init.o symbol '$UNDEF'"
- fi
- fi
+ case "$KBUILD_VERBOSE" in
+ *1*)
+ echo "Checking prom_init.o symbol '$UNDEF'" ;;
+ esac
OK=0
for WHITE in $WHITELIST
diff --git a/arch/powerpc/kernel/rtas-proc.c b/arch/powerpc/kernel/rtas-proc.c
index 081b2b741a8c..9454b8395b6a 100644
--- a/arch/powerpc/kernel/rtas-proc.c
+++ b/arch/powerpc/kernel/rtas-proc.c
@@ -287,9 +287,9 @@ static ssize_t ppc_rtas_poweron_write(struct file *file,
rtc_time64_to_tm(nowtime, &tm);
- error = rtas_call(rtas_token("set-time-for-power-on"), 7, 1, NULL,
- tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
- tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */);
+ error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_FOR_POWER_ON), 7, 1, NULL,
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, 0 /* nano */);
if (error)
printk(KERN_WARNING "error: setting poweron time returned: %s\n",
ppc_rtas_process_error(error));
@@ -350,9 +350,9 @@ static ssize_t ppc_rtas_clock_write(struct file *file,
return error;
rtc_time64_to_tm(nowtime, &tm);
- error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL,
- tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
- tm.tm_hour, tm.tm_min, tm.tm_sec, 0);
+ error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL,
+ tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, 0);
if (error)
printk(KERN_WARNING "error: setting the clock returned: %s\n",
ppc_rtas_process_error(error));
@@ -362,7 +362,7 @@ static ssize_t ppc_rtas_clock_write(struct file *file,
static int ppc_rtas_clock_show(struct seq_file *m, void *v)
{
int ret[8];
- int error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+ int error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret);
if (error) {
printk(KERN_WARNING "error: reading the clock returned: %s\n",
@@ -385,7 +385,7 @@ static int ppc_rtas_sensors_show(struct seq_file *m, void *v)
{
int i,j;
int state, error;
- int get_sensor_state = rtas_token("get-sensor-state");
+ int get_sensor_state = rtas_function_token(RTAS_FN_GET_SENSOR_STATE);
seq_printf(m, "RTAS (RunTime Abstraction Services) Sensor Information\n");
seq_printf(m, "Sensor\t\tValue\t\tCondition\tLocation\n");
@@ -708,8 +708,8 @@ static ssize_t ppc_rtas_tone_freq_write(struct file *file,
return error;
rtas_tone_frequency = freq; /* save it for later */
- error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
- TONE_FREQUENCY, 0, freq);
+ error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL,
+ TONE_FREQUENCY, 0, freq);
if (error)
printk(KERN_WARNING "error: setting tone frequency returned: %s\n",
ppc_rtas_process_error(error));
@@ -736,8 +736,8 @@ static ssize_t ppc_rtas_tone_volume_write(struct file *file,
volume = 100;
rtas_tone_volume = volume; /* save it for later */
- error = rtas_call(rtas_token("set-indicator"), 3, 1, NULL,
- TONE_VOLUME, 0, volume);
+ error = rtas_call(rtas_function_token(RTAS_FN_SET_INDICATOR), 3, 1, NULL,
+ TONE_VOLUME, 0, volume);
if (error)
printk(KERN_WARNING "error: setting tone volume returned: %s\n",
ppc_rtas_process_error(error));
diff --git a/arch/powerpc/kernel/rtas-rtc.c b/arch/powerpc/kernel/rtas-rtc.c
index 5a31d1829bca..6996214532bd 100644
--- a/arch/powerpc/kernel/rtas-rtc.c
+++ b/arch/powerpc/kernel/rtas-rtc.c
@@ -21,7 +21,7 @@ time64_t __init rtas_get_boot_time(void)
max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
do {
- error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+ error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret);
wait_time = rtas_busy_delay_time(error);
if (wait_time) {
@@ -53,7 +53,7 @@ void rtas_get_rtc_time(struct rtc_time *rtc_tm)
max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
do {
- error = rtas_call(rtas_token("get-time-of-day"), 0, 8, ret);
+ error = rtas_call(rtas_function_token(RTAS_FN_GET_TIME_OF_DAY), 0, 8, ret);
wait_time = rtas_busy_delay_time(error);
if (wait_time) {
@@ -90,7 +90,7 @@ int rtas_set_rtc_time(struct rtc_time *tm)
max_wait_tb = get_tb() + tb_ticks_per_usec * 1000 * MAX_RTC_WAIT;
do {
- error = rtas_call(rtas_token("set-time-of-day"), 7, 1, NULL,
+ error = rtas_call(rtas_function_token(RTAS_FN_SET_TIME_OF_DAY), 7, 1, NULL,
tm->tm_year + 1900, tm->tm_mon + 1,
tm->tm_mday, tm->tm_hour, tm->tm_min,
tm->tm_sec, 0);
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index deded51a7978..31175b34856a 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -9,10 +9,12 @@
#define pr_fmt(fmt) "rtas: " fmt
+#include <linux/bsearch.h>
#include <linux/capability.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/init.h>
+#include <linux/kconfig.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/of.h>
@@ -26,6 +28,7 @@
#include <linux/syscalls.h>
#include <linux/types.h>
#include <linux/uaccess.h>
+#include <linux/xarray.h>
#include <asm/delay.h>
#include <asm/firmware.h>
@@ -33,43 +36,604 @@
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/page.h>
+#include <asm/rtas-work-area.h>
#include <asm/rtas.h>
#include <asm/time.h>
+#include <asm/trace.h>
#include <asm/udbg.h>
+struct rtas_filter {
+ /* Indexes into the args buffer, -1 if not used */
+ const int buf_idx1;
+ const int size_idx1;
+ const int buf_idx2;
+ const int size_idx2;
+ /*
+ * Assumed buffer size per the spec if the function does not
+ * have a size parameter, e.g. ibm,errinjct. 0 if unused.
+ */
+ const int fixed_size;
+};
+
+/**
+ * struct rtas_function - Descriptor for RTAS functions.
+ *
+ * @token: Value of @name if it exists under the /rtas node.
+ * @name: Function name.
+ * @filter: If non-NULL, invoking this function via the rtas syscall is
+ * generally allowed, and @filter describes constraints on the
+ * arguments. See also @banned_for_syscall_on_le.
+ * @banned_for_syscall_on_le: Set when call via sys_rtas is generally allowed
+ * but specifically restricted on ppc64le. Such
+ * functions are believed to have no users on
+ * ppc64le, and we want to keep it that way. It does
+ * not make sense for this to be set when @filter
+ * is false.
+ */
+struct rtas_function {
+ s32 token;
+ const bool banned_for_syscall_on_le:1;
+ const char * const name;
+ const struct rtas_filter *filter;
+};
+
+static struct rtas_function rtas_function_table[] __ro_after_init = {
+ [RTAS_FNIDX__CHECK_EXCEPTION] = {
+ .name = "check-exception",
+ },
+ [RTAS_FNIDX__DISPLAY_CHARACTER] = {
+ .name = "display-character",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__EVENT_SCAN] = {
+ .name = "event-scan",
+ },
+ [RTAS_FNIDX__FREEZE_TIME_BASE] = {
+ .name = "freeze-time-base",
+ },
+ [RTAS_FNIDX__GET_POWER_LEVEL] = {
+ .name = "get-power-level",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__GET_SENSOR_STATE] = {
+ .name = "get-sensor-state",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__GET_TERM_CHAR] = {
+ .name = "get-term-char",
+ },
+ [RTAS_FNIDX__GET_TIME_OF_DAY] = {
+ .name = "get-time-of-day",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_ACTIVATE_FIRMWARE] = {
+ .name = "ibm,activate-firmware",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_CBE_START_PTCAL] = {
+ .name = "ibm,cbe-start-ptcal",
+ },
+ [RTAS_FNIDX__IBM_CBE_STOP_PTCAL] = {
+ .name = "ibm,cbe-stop-ptcal",
+ },
+ [RTAS_FNIDX__IBM_CHANGE_MSI] = {
+ .name = "ibm,change-msi",
+ },
+ [RTAS_FNIDX__IBM_CLOSE_ERRINJCT] = {
+ .name = "ibm,close-errinjct",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_CONFIGURE_BRIDGE] = {
+ .name = "ibm,configure-bridge",
+ },
+ [RTAS_FNIDX__IBM_CONFIGURE_CONNECTOR] = {
+ .name = "ibm,configure-connector",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = -1,
+ .buf_idx2 = 1, .size_idx2 = -1,
+ .fixed_size = 4096,
+ },
+ },
+ [RTAS_FNIDX__IBM_CONFIGURE_KERNEL_DUMP] = {
+ .name = "ibm,configure-kernel-dump",
+ },
+ [RTAS_FNIDX__IBM_CONFIGURE_PE] = {
+ .name = "ibm,configure-pe",
+ },
+ [RTAS_FNIDX__IBM_CREATE_PE_DMA_WINDOW] = {
+ .name = "ibm,create-pe-dma-window",
+ },
+ [RTAS_FNIDX__IBM_DISPLAY_MESSAGE] = {
+ .name = "ibm,display-message",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_ERRINJCT] = {
+ .name = "ibm,errinjct",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 2, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ .fixed_size = 1024,
+ },
+ },
+ [RTAS_FNIDX__IBM_EXTI2C] = {
+ .name = "ibm,exti2c",
+ },
+ [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO] = {
+ .name = "ibm,get-config-addr-info",
+ },
+ [RTAS_FNIDX__IBM_GET_CONFIG_ADDR_INFO2] = {
+ .name = "ibm,get-config-addr-info2",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_GET_DYNAMIC_SENSOR_STATE] = {
+ .name = "ibm,get-dynamic-sensor-state",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_GET_INDICES] = {
+ .name = "ibm,get-indices",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 2, .size_idx1 = 3,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_GET_RIO_TOPOLOGY] = {
+ .name = "ibm,get-rio-topology",
+ },
+ [RTAS_FNIDX__IBM_GET_SYSTEM_PARAMETER] = {
+ .name = "ibm,get-system-parameter",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 1, .size_idx1 = 2,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_GET_VPD] = {
+ .name = "ibm,get-vpd",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = -1,
+ .buf_idx2 = 1, .size_idx2 = 2,
+ },
+ },
+ [RTAS_FNIDX__IBM_GET_XIVE] = {
+ .name = "ibm,get-xive",
+ },
+ [RTAS_FNIDX__IBM_INT_OFF] = {
+ .name = "ibm,int-off",
+ },
+ [RTAS_FNIDX__IBM_INT_ON] = {
+ .name = "ibm,int-on",
+ },
+ [RTAS_FNIDX__IBM_IO_QUIESCE_ACK] = {
+ .name = "ibm,io-quiesce-ack",
+ },
+ [RTAS_FNIDX__IBM_LPAR_PERFTOOLS] = {
+ .name = "ibm,lpar-perftools",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 2, .size_idx1 = 3,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_MANAGE_FLASH_IMAGE] = {
+ .name = "ibm,manage-flash-image",
+ },
+ [RTAS_FNIDX__IBM_MANAGE_STORAGE_PRESERVATION] = {
+ .name = "ibm,manage-storage-preservation",
+ },
+ [RTAS_FNIDX__IBM_NMI_INTERLOCK] = {
+ .name = "ibm,nmi-interlock",
+ },
+ [RTAS_FNIDX__IBM_NMI_REGISTER] = {
+ .name = "ibm,nmi-register",
+ },
+ [RTAS_FNIDX__IBM_OPEN_ERRINJCT] = {
+ .name = "ibm,open-errinjct",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_OPEN_SRIOV_ALLOW_UNFREEZE] = {
+ .name = "ibm,open-sriov-allow-unfreeze",
+ },
+ [RTAS_FNIDX__IBM_OPEN_SRIOV_MAP_PE_NUMBER] = {
+ .name = "ibm,open-sriov-map-pe-number",
+ },
+ [RTAS_FNIDX__IBM_OS_TERM] = {
+ .name = "ibm,os-term",
+ },
+ [RTAS_FNIDX__IBM_PARTNER_CONTROL] = {
+ .name = "ibm,partner-control",
+ },
+ [RTAS_FNIDX__IBM_PHYSICAL_ATTESTATION] = {
+ .name = "ibm,physical-attestation",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = 1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_PLATFORM_DUMP] = {
+ .name = "ibm,platform-dump",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 4, .size_idx1 = 5,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_POWER_OFF_UPS] = {
+ .name = "ibm,power-off-ups",
+ },
+ [RTAS_FNIDX__IBM_QUERY_INTERRUPT_SOURCE_NUMBER] = {
+ .name = "ibm,query-interrupt-source-number",
+ },
+ [RTAS_FNIDX__IBM_QUERY_PE_DMA_WINDOW] = {
+ .name = "ibm,query-pe-dma-window",
+ },
+ [RTAS_FNIDX__IBM_READ_PCI_CONFIG] = {
+ .name = "ibm,read-pci-config",
+ },
+ [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE] = {
+ .name = "ibm,read-slot-reset-state",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_READ_SLOT_RESET_STATE2] = {
+ .name = "ibm,read-slot-reset-state2",
+ },
+ [RTAS_FNIDX__IBM_REMOVE_PE_DMA_WINDOW] = {
+ .name = "ibm,remove-pe-dma-window",
+ },
+ [RTAS_FNIDX__IBM_RESET_PE_DMA_WINDOWS] = {
+ .name = "ibm,reset-pe-dma-windows",
+ },
+ [RTAS_FNIDX__IBM_SCAN_LOG_DUMP] = {
+ .name = "ibm,scan-log-dump",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = 1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_SET_DYNAMIC_INDICATOR] = {
+ .name = "ibm,set-dynamic-indicator",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 2, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_SET_EEH_OPTION] = {
+ .name = "ibm,set-eeh-option",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_SET_SLOT_RESET] = {
+ .name = "ibm,set-slot-reset",
+ },
+ [RTAS_FNIDX__IBM_SET_SYSTEM_PARAMETER] = {
+ .name = "ibm,set-system-parameter",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_SET_XIVE] = {
+ .name = "ibm,set-xive",
+ },
+ [RTAS_FNIDX__IBM_SLOT_ERROR_DETAIL] = {
+ .name = "ibm,slot-error-detail",
+ },
+ [RTAS_FNIDX__IBM_SUSPEND_ME] = {
+ .name = "ibm,suspend-me",
+ .banned_for_syscall_on_le = true,
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__IBM_TUNE_DMA_PARMS] = {
+ .name = "ibm,tune-dma-parms",
+ },
+ [RTAS_FNIDX__IBM_UPDATE_FLASH_64_AND_REBOOT] = {
+ .name = "ibm,update-flash-64-and-reboot",
+ },
+ [RTAS_FNIDX__IBM_UPDATE_NODES] = {
+ .name = "ibm,update-nodes",
+ .banned_for_syscall_on_le = true,
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ .fixed_size = 4096,
+ },
+ },
+ [RTAS_FNIDX__IBM_UPDATE_PROPERTIES] = {
+ .name = "ibm,update-properties",
+ .banned_for_syscall_on_le = true,
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = 0, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ .fixed_size = 4096,
+ },
+ },
+ [RTAS_FNIDX__IBM_VALIDATE_FLASH_IMAGE] = {
+ .name = "ibm,validate-flash-image",
+ },
+ [RTAS_FNIDX__IBM_WRITE_PCI_CONFIG] = {
+ .name = "ibm,write-pci-config",
+ },
+ [RTAS_FNIDX__NVRAM_FETCH] = {
+ .name = "nvram-fetch",
+ },
+ [RTAS_FNIDX__NVRAM_STORE] = {
+ .name = "nvram-store",
+ },
+ [RTAS_FNIDX__POWER_OFF] = {
+ .name = "power-off",
+ },
+ [RTAS_FNIDX__PUT_TERM_CHAR] = {
+ .name = "put-term-char",
+ },
+ [RTAS_FNIDX__QUERY_CPU_STOPPED_STATE] = {
+ .name = "query-cpu-stopped-state",
+ },
+ [RTAS_FNIDX__READ_PCI_CONFIG] = {
+ .name = "read-pci-config",
+ },
+ [RTAS_FNIDX__RTAS_LAST_ERROR] = {
+ .name = "rtas-last-error",
+ },
+ [RTAS_FNIDX__SET_INDICATOR] = {
+ .name = "set-indicator",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__SET_POWER_LEVEL] = {
+ .name = "set-power-level",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__SET_TIME_FOR_POWER_ON] = {
+ .name = "set-time-for-power-on",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__SET_TIME_OF_DAY] = {
+ .name = "set-time-of-day",
+ .filter = &(const struct rtas_filter) {
+ .buf_idx1 = -1, .size_idx1 = -1,
+ .buf_idx2 = -1, .size_idx2 = -1,
+ },
+ },
+ [RTAS_FNIDX__START_CPU] = {
+ .name = "start-cpu",
+ },
+ [RTAS_FNIDX__STOP_SELF] = {
+ .name = "stop-self",
+ },
+ [RTAS_FNIDX__SYSTEM_REBOOT] = {
+ .name = "system-reboot",
+ },
+ [RTAS_FNIDX__THAW_TIME_BASE] = {
+ .name = "thaw-time-base",
+ },
+ [RTAS_FNIDX__WRITE_PCI_CONFIG] = {
+ .name = "write-pci-config",
+ },
+};
+
+/**
+ * rtas_function_token() - RTAS function token lookup.
+ * @handle: Function handle, e.g. RTAS_FN_EVENT_SCAN.
+ *
+ * Context: Any context.
+ * Return: the token value for the function if implemented by this platform,
+ * otherwise RTAS_UNKNOWN_SERVICE.
+ */
+s32 rtas_function_token(const rtas_fn_handle_t handle)
+{
+ const size_t index = handle.index;
+ const bool out_of_bounds = index >= ARRAY_SIZE(rtas_function_table);
+
+ if (WARN_ONCE(out_of_bounds, "invalid function index %zu", index))
+ return RTAS_UNKNOWN_SERVICE;
+ /*
+ * Various drivers attempt token lookups on non-RTAS
+ * platforms.
+ */
+ if (!rtas.dev)
+ return RTAS_UNKNOWN_SERVICE;
+
+ return rtas_function_table[index].token;
+}
+EXPORT_SYMBOL_GPL(rtas_function_token);
+
+static int rtas_function_cmp(const void *a, const void *b)
+{
+ const struct rtas_function *f1 = a;
+ const struct rtas_function *f2 = b;
+
+ return strcmp(f1->name, f2->name);
+}
+
+/*
+ * Boot-time initialization of the function table needs the lookup to
+ * return a non-const-qualified object. Use rtas_name_to_function()
+ * in all other contexts.
+ */
+static struct rtas_function *__rtas_name_to_function(const char *name)
+{
+ const struct rtas_function key = {
+ .name = name,
+ };
+ struct rtas_function *found;
+
+ found = bsearch(&key, rtas_function_table, ARRAY_SIZE(rtas_function_table),
+ sizeof(rtas_function_table[0]), rtas_function_cmp);
+
+ return found;
+}
+
+static const struct rtas_function *rtas_name_to_function(const char *name)
+{
+ return __rtas_name_to_function(name);
+}
+
+static DEFINE_XARRAY(rtas_token_to_function_xarray);
+
+static int __init rtas_token_to_function_xarray_init(void)
+{
+ int err = 0;
+
+ for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) {
+ const struct rtas_function *func = &rtas_function_table[i];
+ const s32 token = func->token;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ continue;
+
+ err = xa_err(xa_store(&rtas_token_to_function_xarray,
+ token, (void *)func, GFP_KERNEL));
+ if (err)
+ break;
+ }
+
+ return err;
+}
+arch_initcall(rtas_token_to_function_xarray_init);
+
+static const struct rtas_function *rtas_token_to_function(s32 token)
+{
+ const struct rtas_function *func;
+
+ if (WARN_ONCE(token < 0, "invalid token %d", token))
+ return NULL;
+
+ func = xa_load(&rtas_token_to_function_xarray, token);
+
+ if (WARN_ONCE(!func, "unexpected failed lookup for token %d", token))
+ return NULL;
+
+ return func;
+}
+
/* This is here deliberately so it's only used in this file */
void enter_rtas(unsigned long);
-static inline void do_enter_rtas(unsigned long args)
+static void __do_enter_rtas(struct rtas_args *args)
+{
+ enter_rtas(__pa(args));
+ srr_regs_clobbered(); /* rtas uses SRRs, invalidate */
+}
+
+static void __do_enter_rtas_trace(struct rtas_args *args)
{
- unsigned long msr;
+ const char *name = NULL;
+ /*
+ * If the tracepoints that consume the function name aren't
+ * active, avoid the lookup.
+ */
+ if ((trace_rtas_input_enabled() || trace_rtas_output_enabled())) {
+ const s32 token = be32_to_cpu(args->token);
+ const struct rtas_function *func = rtas_token_to_function(token);
+
+ name = func->name;
+ }
+
+ trace_rtas_input(args, name);
+ trace_rtas_ll_entry(args);
+
+ __do_enter_rtas(args);
+
+ trace_rtas_ll_exit(args);
+ trace_rtas_output(args, name);
+}
+static void do_enter_rtas(struct rtas_args *args)
+{
+ const unsigned long msr = mfmsr();
+ /*
+ * Situations where we want to skip any active tracepoints for
+ * safety reasons:
+ *
+ * 1. The last code executed on an offline CPU as it stops,
+ * i.e. we're about to call stop-self. The tracepoints'
+ * function name lookup uses xarray, which uses RCU, which
+ * isn't valid to call on an offline CPU. Any events
+ * emitted on an offline CPU will be discarded anyway.
+ *
+ * 2. In real mode, as when invoking ibm,nmi-interlock from
+ * the pseries MCE handler. We cannot count on trace
+ * buffers or the entries in rtas_token_to_function_xarray
+ * to be contained in the RMO.
+ */
+ const unsigned long mask = MSR_IR | MSR_DR;
+ const bool can_trace = likely(cpu_online(raw_smp_processor_id()) &&
+ (msr & mask) == mask);
/*
* Make sure MSR[RI] is currently enabled as it will be forced later
* in enter_rtas.
*/
- msr = mfmsr();
BUG_ON(!(msr & MSR_RI));
BUG_ON(!irqs_disabled());
hard_irq_disable(); /* Ensure MSR[EE] is disabled on PPC64 */
- enter_rtas(args);
-
- srr_regs_clobbered(); /* rtas uses SRRs, invalidate */
+ if (can_trace)
+ __do_enter_rtas_trace(args);
+ else
+ __do_enter_rtas(args);
}
-struct rtas_t rtas = {
- .lock = __ARCH_SPIN_LOCK_UNLOCKED
-};
-EXPORT_SYMBOL(rtas);
+struct rtas_t rtas;
+
+/*
+ * Nearly all RTAS calls need to be serialized. All uses of the
+ * default rtas_args block must hold rtas_lock.
+ *
+ * Exceptions to the RTAS serialization requirement (e.g. stop-self)
+ * must use a separate rtas_args structure.
+ */
+static DEFINE_RAW_SPINLOCK(rtas_lock);
+static struct rtas_args rtas_args;
DEFINE_SPINLOCK(rtas_data_buf_lock);
-EXPORT_SYMBOL(rtas_data_buf_lock);
+EXPORT_SYMBOL_GPL(rtas_data_buf_lock);
-char rtas_data_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned;
-EXPORT_SYMBOL(rtas_data_buf);
+char rtas_data_buf[RTAS_DATA_BUF_SIZE] __aligned(SZ_4K);
+EXPORT_SYMBOL_GPL(rtas_data_buf);
unsigned long rtas_rmo_buf;
@@ -78,29 +642,7 @@ unsigned long rtas_rmo_buf;
* This is done like this so rtas_flash can be a module.
*/
void (*rtas_flash_term_hook)(int);
-EXPORT_SYMBOL(rtas_flash_term_hook);
-
-/* RTAS use home made raw locking instead of spin_lock_irqsave
- * because those can be called from within really nasty contexts
- * such as having the timebase stopped which would lockup with
- * normal locks and spinlock debugging enabled
- */
-static unsigned long lock_rtas(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- preempt_disable();
- arch_spin_lock(&rtas.lock);
- return flags;
-}
-
-static void unlock_rtas(unsigned long flags)
-{
- arch_spin_unlock(&rtas.lock);
- local_irq_restore(flags);
- preempt_enable();
-}
+EXPORT_SYMBOL_GPL(rtas_flash_term_hook);
/*
* call_rtas_display_status and call_rtas_display_status_delay
@@ -109,14 +651,14 @@ static void unlock_rtas(unsigned long flags)
*/
static void call_rtas_display_status(unsigned char c)
{
- unsigned long s;
+ unsigned long flags;
if (!rtas.base)
return;
- s = lock_rtas();
- rtas_call_unlocked(&rtas.args, 10, 1, 1, NULL, c);
- unlock_rtas(s);
+ raw_spin_lock_irqsave(&rtas_lock, flags);
+ rtas_call_unlocked(&rtas_args, 10, 1, 1, NULL, c);
+ raw_spin_unlock_irqrestore(&rtas_lock, flags);
}
static void call_rtas_display_status_delay(char c)
@@ -240,8 +782,8 @@ void rtas_progress(char *s, unsigned short hex)
"ibm,display-truncation-length", NULL);
of_node_put(root);
}
- display_character = rtas_token("display-character");
- set_indicator = rtas_token("set-indicator");
+ display_character = rtas_function_token(RTAS_FN_DISPLAY_CHARACTER);
+ set_indicator = rtas_function_token(RTAS_FN_SET_INDICATOR);
}
if (display_character == RTAS_UNKNOWN_SERVICE) {
@@ -326,23 +868,38 @@ void rtas_progress(char *s, unsigned short hex)
spin_unlock(&progress_lock);
}
-EXPORT_SYMBOL(rtas_progress); /* needed by rtas_flash module */
+EXPORT_SYMBOL_GPL(rtas_progress); /* needed by rtas_flash module */
int rtas_token(const char *service)
{
+ const struct rtas_function *func;
const __be32 *tokp;
+
if (rtas.dev == NULL)
return RTAS_UNKNOWN_SERVICE;
+
+ func = rtas_name_to_function(service);
+ if (func)
+ return func->token;
+ /*
+ * The caller is looking up a name that is not known to be an
+ * RTAS function. Either it's a function that needs to be
+ * added to the table, or they're misusing rtas_token() to
+ * access non-function properties of the /rtas node. Warn and
+ * fall back to the legacy behavior.
+ */
+ WARN_ONCE(1, "unknown function `%s`, should it be added to rtas_function_table?\n",
+ service);
+
tokp = of_get_property(rtas.dev, service, NULL);
return tokp ? be32_to_cpu(*tokp) : RTAS_UNKNOWN_SERVICE;
}
-EXPORT_SYMBOL(rtas_token);
+EXPORT_SYMBOL_GPL(rtas_token);
int rtas_service_present(const char *service)
{
return rtas_token(service) != RTAS_UNKNOWN_SERVICE;
}
-EXPORT_SYMBOL(rtas_service_present);
#ifdef CONFIG_RTAS_ERROR_LOGGING
@@ -357,7 +914,6 @@ int rtas_get_error_log_max(void)
{
return rtas_error_log_max;
}
-EXPORT_SYMBOL(rtas_get_error_log_max);
static void __init init_error_log_max(void)
{
@@ -381,39 +937,39 @@ static void __init init_error_log_max(void)
static char rtas_err_buf[RTAS_ERROR_LOG_MAX];
-static int rtas_last_error_token;
/** Return a copy of the detailed error text associated with the
* most recent failed call to rtas. Because the error text
* might go stale if there are any other intervening rtas calls,
* this routine must be called atomically with whatever produced
- * the error (i.e. with rtas.lock still held from the previous call).
+ * the error (i.e. with rtas_lock still held from the previous call).
*/
static char *__fetch_rtas_last_error(char *altbuf)
{
+ const s32 token = rtas_function_token(RTAS_FN_RTAS_LAST_ERROR);
struct rtas_args err_args, save_args;
u32 bufsz;
char *buf = NULL;
- if (rtas_last_error_token == -1)
+ if (token == -1)
return NULL;
bufsz = rtas_get_error_log_max();
- err_args.token = cpu_to_be32(rtas_last_error_token);
+ err_args.token = cpu_to_be32(token);
err_args.nargs = cpu_to_be32(2);
err_args.nret = cpu_to_be32(1);
err_args.args[0] = cpu_to_be32(__pa(rtas_err_buf));
err_args.args[1] = cpu_to_be32(bufsz);
err_args.args[2] = 0;
- save_args = rtas.args;
- rtas.args = err_args;
+ save_args = rtas_args;
+ rtas_args = err_args;
- do_enter_rtas(__pa(&rtas.args));
+ do_enter_rtas(&rtas_args);
- err_args = rtas.args;
- rtas.args = save_args;
+ err_args = rtas_args;
+ rtas_args = save_args;
/* Log the error in the unlikely case that there was one. */
if (unlikely(err_args.args[2] == 0)) {
@@ -457,7 +1013,7 @@ va_rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
for (i = 0; i < nret; ++i)
args->rets[i] = 0;
- do_enter_rtas(__pa(args));
+ do_enter_rtas(args);
}
void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret, ...)
@@ -469,8 +1025,11 @@ void rtas_call_unlocked(struct rtas_args *args, int token, int nargs, int nret,
va_end(list);
}
-static int ibm_open_errinjct_token;
-static int ibm_errinjct_token;
+static bool token_is_restricted_errinjct(s32 token)
+{
+ return token == rtas_function_token(RTAS_FN_IBM_OPEN_ERRINJCT) ||
+ token == rtas_function_token(RTAS_FN_IBM_ERRINJCT);
+}
/**
* rtas_call() - Invoke an RTAS firmware function.
@@ -481,7 +1040,7 @@ static int ibm_errinjct_token;
* @....: List of @nargs input parameters.
*
* Invokes the RTAS function indicated by @token, which the caller
- * should obtain via rtas_token().
+ * should obtain via rtas_function_token().
*
* The @nargs and @nret arguments must match the number of input and
* output parameters specified for the RTAS function.
@@ -534,15 +1093,15 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
{
va_list list;
int i;
- unsigned long s;
- struct rtas_args *rtas_args;
+ unsigned long flags;
+ struct rtas_args *args;
char *buff_copy = NULL;
int ret;
if (!rtas.entry || token == RTAS_UNKNOWN_SERVICE)
return -1;
- if (token == ibm_open_errinjct_token || token == ibm_errinjct_token) {
+ if (token_is_restricted_errinjct(token)) {
/*
* It would be nicer to not discard the error value
* from security_locked_down(), but callers expect an
@@ -557,26 +1116,25 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
return -1;
}
- s = lock_rtas();
-
+ raw_spin_lock_irqsave(&rtas_lock, flags);
/* We use the global rtas args buffer */
- rtas_args = &rtas.args;
+ args = &rtas_args;
va_start(list, outputs);
- va_rtas_call_unlocked(rtas_args, token, nargs, nret, list);
+ va_rtas_call_unlocked(args, token, nargs, nret, list);
va_end(list);
/* A -1 return code indicates that the last command couldn't
be completed due to a hardware error. */
- if (be32_to_cpu(rtas_args->rets[0]) == -1)
+ if (be32_to_cpu(args->rets[0]) == -1)
buff_copy = __fetch_rtas_last_error(NULL);
if (nret > 1 && outputs != NULL)
for (i = 0; i < nret-1; ++i)
- outputs[i] = be32_to_cpu(rtas_args->rets[i+1]);
- ret = (nret > 0)? be32_to_cpu(rtas_args->rets[0]): 0;
+ outputs[i] = be32_to_cpu(args->rets[i + 1]);
+ ret = (nret > 0) ? be32_to_cpu(args->rets[0]) : 0;
- unlock_rtas(s);
+ raw_spin_unlock_irqrestore(&rtas_lock, flags);
if (buff_copy) {
log_error(buff_copy, ERR_TYPE_RTAS_LOG, 0);
@@ -585,7 +1143,7 @@ int rtas_call(int token, int nargs, int nret, int *outputs, ...)
}
return ret;
}
-EXPORT_SYMBOL(rtas_call);
+EXPORT_SYMBOL_GPL(rtas_call);
/**
* rtas_busy_delay_time() - From an RTAS status value, calculate the
@@ -623,7 +1181,47 @@ unsigned int rtas_busy_delay_time(int status)
return ms;
}
-EXPORT_SYMBOL(rtas_busy_delay_time);
+
+/*
+ * Early boot fallback for rtas_busy_delay().
+ */
+static bool __init rtas_busy_delay_early(int status)
+{
+ static size_t successive_ext_delays __initdata;
+ bool retry;
+
+ switch (status) {
+ case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
+ /*
+ * In the unlikely case that we receive an extended
+ * delay status in early boot, the OS is probably not
+ * the cause, and there's nothing we can do to clear
+ * the condition. Best we can do is delay for a bit
+ * and hope it's transient. Lie to the caller if it
+ * seems like we're stuck in a retry loop.
+ */
+ mdelay(1);
+ retry = true;
+ successive_ext_delays += 1;
+ if (successive_ext_delays > 1000) {
+ pr_err("too many extended delays, giving up\n");
+ dump_stack();
+ retry = false;
+ successive_ext_delays = 0;
+ }
+ break;
+ case RTAS_BUSY:
+ retry = true;
+ successive_ext_delays = 0;
+ break;
+ default:
+ retry = false;
+ successive_ext_delays = 0;
+ break;
+ }
+
+ return retry;
+}
/**
* rtas_busy_delay() - helper for RTAS busy and extended delay statuses
@@ -643,11 +1241,17 @@ EXPORT_SYMBOL(rtas_busy_delay_time);
* * false - @status is not @RTAS_BUSY nor an extended delay hint. The
* caller is responsible for handling @status.
*/
-bool rtas_busy_delay(int status)
+bool __ref rtas_busy_delay(int status)
{
unsigned int ms;
bool ret;
+ /*
+ * Can't do timed sleeps before timekeeping is up.
+ */
+ if (system_state < SYSTEM_SCHEDULING)
+ return rtas_busy_delay_early(status);
+
switch (status) {
case RTAS_EXTENDED_DELAY_MIN...RTAS_EXTENDED_DELAY_MAX:
ret = true;
@@ -697,7 +1301,7 @@ bool rtas_busy_delay(int status)
return ret;
}
-EXPORT_SYMBOL(rtas_busy_delay);
+EXPORT_SYMBOL_GPL(rtas_busy_delay);
static int rtas_error_rc(int rtas_rc)
{
@@ -729,7 +1333,7 @@ static int rtas_error_rc(int rtas_rc)
int rtas_get_power_level(int powerdomain, int *level)
{
- int token = rtas_token("get-power-level");
+ int token = rtas_function_token(RTAS_FN_GET_POWER_LEVEL);
int rc;
if (token == RTAS_UNKNOWN_SERVICE)
@@ -742,11 +1346,11 @@ int rtas_get_power_level(int powerdomain, int *level)
return rtas_error_rc(rc);
return rc;
}
-EXPORT_SYMBOL(rtas_get_power_level);
+EXPORT_SYMBOL_GPL(rtas_get_power_level);
int rtas_set_power_level(int powerdomain, int level, int *setlevel)
{
- int token = rtas_token("set-power-level");
+ int token = rtas_function_token(RTAS_FN_SET_POWER_LEVEL);
int rc;
if (token == RTAS_UNKNOWN_SERVICE)
@@ -760,11 +1364,11 @@ int rtas_set_power_level(int powerdomain, int level, int *setlevel)
return rtas_error_rc(rc);
return rc;
}
-EXPORT_SYMBOL(rtas_set_power_level);
+EXPORT_SYMBOL_GPL(rtas_set_power_level);
int rtas_get_sensor(int sensor, int index, int *state)
{
- int token = rtas_token("get-sensor-state");
+ int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE);
int rc;
if (token == RTAS_UNKNOWN_SERVICE)
@@ -778,11 +1382,11 @@ int rtas_get_sensor(int sensor, int index, int *state)
return rtas_error_rc(rc);
return rc;
}
-EXPORT_SYMBOL(rtas_get_sensor);
+EXPORT_SYMBOL_GPL(rtas_get_sensor);
int rtas_get_sensor_fast(int sensor, int index, int *state)
{
- int token = rtas_token("get-sensor-state");
+ int token = rtas_function_token(RTAS_FN_GET_SENSOR_STATE);
int rc;
if (token == RTAS_UNKNOWN_SERVICE)
@@ -821,11 +1425,10 @@ bool rtas_indicator_present(int token, int *maxindex)
return false;
}
-EXPORT_SYMBOL(rtas_indicator_present);
int rtas_set_indicator(int indicator, int index, int new_value)
{
- int token = rtas_token("set-indicator");
+ int token = rtas_function_token(RTAS_FN_SET_INDICATOR);
int rc;
if (token == RTAS_UNKNOWN_SERVICE)
@@ -839,15 +1442,15 @@ int rtas_set_indicator(int indicator, int index, int new_value)
return rtas_error_rc(rc);
return rc;
}
-EXPORT_SYMBOL(rtas_set_indicator);
+EXPORT_SYMBOL_GPL(rtas_set_indicator);
/*
* Ignoring RTAS extended delay
*/
int rtas_set_indicator_fast(int indicator, int index, int new_value)
{
+ int token = rtas_function_token(RTAS_FN_SET_INDICATOR);
int rc;
- int token = rtas_token("set-indicator");
if (token == RTAS_UNKNOWN_SERVICE)
return -ENOENT;
@@ -889,10 +1492,11 @@ int rtas_set_indicator_fast(int indicator, int index, int new_value)
*/
int rtas_ibm_suspend_me(int *fw_status)
{
+ int token = rtas_function_token(RTAS_FN_IBM_SUSPEND_ME);
int fwrc;
int ret;
- fwrc = rtas_call(rtas_token("ibm,suspend-me"), 0, 1, NULL);
+ fwrc = rtas_call(token, 0, 1, NULL);
switch (fwrc) {
case 0:
@@ -925,7 +1529,7 @@ void __noreturn rtas_restart(char *cmd)
if (rtas_flash_term_hook)
rtas_flash_term_hook(SYS_RESTART);
pr_emerg("system-reboot returned %d\n",
- rtas_call(rtas_token("system-reboot"), 0, 1, NULL));
+ rtas_call(rtas_function_token(RTAS_FN_SYSTEM_REBOOT), 0, 1, NULL));
for (;;);
}
@@ -935,7 +1539,7 @@ void rtas_power_off(void)
rtas_flash_term_hook(SYS_POWER_OFF);
/* allow power on only with power button press */
pr_emerg("power-off returned %d\n",
- rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1));
+ rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1));
for (;;);
}
@@ -945,16 +1549,17 @@ void __noreturn rtas_halt(void)
rtas_flash_term_hook(SYS_HALT);
/* allow power on only with power button press */
pr_emerg("power-off returned %d\n",
- rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1));
+ rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1));
for (;;);
}
/* Must be in the RMO region, so we place it here */
static char rtas_os_term_buf[2048];
-static s32 ibm_os_term_token = RTAS_UNKNOWN_SERVICE;
+static bool ibm_extended_os_term;
void rtas_os_term(char *str)
{
+ s32 token = rtas_function_token(RTAS_FN_IBM_OS_TERM);
int status;
/*
@@ -963,7 +1568,8 @@ void rtas_os_term(char *str)
* this property may terminate the partition which we want to avoid
* since it interferes with panic_timeout.
*/
- if (ibm_os_term_token == RTAS_UNKNOWN_SERVICE)
+
+ if (token == RTAS_UNKNOWN_SERVICE || !ibm_extended_os_term)
return;
snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
@@ -974,8 +1580,7 @@ void rtas_os_term(char *str)
* schedules.
*/
do {
- status = rtas_call(ibm_os_term_token, 1, 1, NULL,
- __pa(rtas_os_term_buf));
+ status = rtas_call(token, 1, 1, NULL, __pa(rtas_os_term_buf));
} while (rtas_busy_delay_time(status));
if (status != 0)
@@ -995,10 +1600,9 @@ void rtas_os_term(char *str)
*/
void rtas_activate_firmware(void)
{
- int token;
+ int token = rtas_function_token(RTAS_FN_IBM_ACTIVATE_FIRMWARE);
int fwrc;
- token = rtas_token("ibm,activate-firmware");
if (token == RTAS_UNKNOWN_SERVICE) {
pr_notice("ibm,activate-firmware method unavailable\n");
return;
@@ -1063,56 +1667,12 @@ noinstr struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log
*
* Accordingly, we filter RTAS requests to check that the call is
* permitted, and that provided pointers fall within the RMO buffer.
- * The rtas_filters list contains an entry for each permitted call,
- * with the indexes of the parameters which are expected to contain
- * addresses and sizes of buffers allocated inside the RMO buffer.
+ * If a function is allowed to be invoked via the syscall, then its
+ * entry in the rtas_functions table points to a rtas_filter that
+ * describes its constraints, with the indexes of the parameters which
+ * are expected to contain addresses and sizes of buffers allocated
+ * inside the RMO buffer.
*/
-struct rtas_filter {
- const char *name;
- int token;
- /* Indexes into the args buffer, -1 if not used */
- int buf_idx1;
- int size_idx1;
- int buf_idx2;
- int size_idx2;
-
- int fixed_size;
-};
-
-static struct rtas_filter rtas_filters[] __ro_after_init = {
- { "ibm,activate-firmware", -1, -1, -1, -1, -1 },
- { "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */
- { "display-character", -1, -1, -1, -1, -1 },
- { "ibm,display-message", -1, 0, -1, -1, -1 },
- { "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
- { "ibm,close-errinjct", -1, -1, -1, -1, -1 },
- { "ibm,open-errinjct", -1, -1, -1, -1, -1 },
- { "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
- { "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
- { "ibm,get-indices", -1, 2, 3, -1, -1 },
- { "get-power-level", -1, -1, -1, -1, -1 },
- { "get-sensor-state", -1, -1, -1, -1, -1 },
- { "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
- { "get-time-of-day", -1, -1, -1, -1, -1 },
- { "ibm,get-vpd", -1, 0, -1, 1, 2 },
- { "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
- { "ibm,platform-dump", -1, 4, 5, -1, -1 }, /* Special cased */
- { "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
- { "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
- { "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
- { "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
- { "set-indicator", -1, -1, -1, -1, -1 },
- { "set-power-level", -1, -1, -1, -1, -1 },
- { "set-time-for-power-on", -1, -1, -1, -1, -1 },
- { "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
- { "set-time-of-day", -1, -1, -1, -1, -1 },
-#ifdef CONFIG_CPU_BIG_ENDIAN
- { "ibm,suspend-me", -1, -1, -1, -1, -1 },
- { "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
- { "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
-#endif
- { "ibm,physical-attestation", -1, 0, 1, -1, -1 },
-};
static bool in_rmo_buf(u32 base, u32 end)
{
@@ -1126,63 +1686,75 @@ static bool in_rmo_buf(u32 base, u32 end)
static bool block_rtas_call(int token, int nargs,
struct rtas_args *args)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
- struct rtas_filter *f = &rtas_filters[i];
- u32 base, size, end;
+ const struct rtas_function *func;
+ const struct rtas_filter *f;
+ const bool is_platform_dump = token == rtas_function_token(RTAS_FN_IBM_PLATFORM_DUMP);
+ const bool is_config_conn = token == rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR);
+ u32 base, size, end;
- if (token != f->token)
- continue;
-
- if (f->buf_idx1 != -1) {
- base = be32_to_cpu(args->args[f->buf_idx1]);
- if (f->size_idx1 != -1)
- size = be32_to_cpu(args->args[f->size_idx1]);
- else if (f->fixed_size)
- size = f->fixed_size;
- else
- size = 1;
-
- end = base + size - 1;
+ /*
+ * If this token doesn't correspond to a function the kernel
+ * understands, you're not allowed to call it.
+ */
+ func = rtas_token_to_function(token);
+ if (!func)
+ goto err;
+ /*
+ * And only functions with filters attached are allowed.
+ */
+ f = func->filter;
+ if (!f)
+ goto err;
+ /*
+ * And some functions aren't allowed on LE.
+ */
+ if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) && func->banned_for_syscall_on_le)
+ goto err;
+
+ if (f->buf_idx1 != -1) {
+ base = be32_to_cpu(args->args[f->buf_idx1]);
+ if (f->size_idx1 != -1)
+ size = be32_to_cpu(args->args[f->size_idx1]);
+ else if (f->fixed_size)
+ size = f->fixed_size;
+ else
+ size = 1;
- /*
- * Special case for ibm,platform-dump - NULL buffer
- * address is used to indicate end of dump processing
- */
- if (!strcmp(f->name, "ibm,platform-dump") &&
- base == 0)
- return false;
+ end = base + size - 1;
- if (!in_rmo_buf(base, end))
- goto err;
- }
+ /*
+ * Special case for ibm,platform-dump - NULL buffer
+ * address is used to indicate end of dump processing
+ */
+ if (is_platform_dump && base == 0)
+ return false;
- if (f->buf_idx2 != -1) {
- base = be32_to_cpu(args->args[f->buf_idx2]);
- if (f->size_idx2 != -1)
- size = be32_to_cpu(args->args[f->size_idx2]);
- else if (f->fixed_size)
- size = f->fixed_size;
- else
- size = 1;
- end = base + size - 1;
+ if (!in_rmo_buf(base, end))
+ goto err;
+ }
- /*
- * Special case for ibm,configure-connector where the
- * address can be 0
- */
- if (!strcmp(f->name, "ibm,configure-connector") &&
- base == 0)
- return false;
+ if (f->buf_idx2 != -1) {
+ base = be32_to_cpu(args->args[f->buf_idx2]);
+ if (f->size_idx2 != -1)
+ size = be32_to_cpu(args->args[f->size_idx2]);
+ else if (f->fixed_size)
+ size = f->fixed_size;
+ else
+ size = 1;
+ end = base + size - 1;
- if (!in_rmo_buf(base, end))
- goto err;
- }
+ /*
+ * Special case for ibm,configure-connector where the
+ * address can be 0
+ */
+ if (is_config_conn && base == 0)
+ return false;
- return false;
+ if (!in_rmo_buf(base, end))
+ goto err;
}
+ return false;
err:
pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
@@ -1190,14 +1762,6 @@ err:
return true;
}
-static void __init rtas_syscall_filter_init(void)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(rtas_filters); i++)
- rtas_filters[i].token = rtas_token(rtas_filters[i].name);
-}
-
/* We assume to be passed big endian arguments */
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
{
@@ -1238,7 +1802,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
if (block_rtas_call(token, nargs, &args))
return -EINVAL;
- if (token == ibm_open_errinjct_token || token == ibm_errinjct_token) {
+ if (token_is_restricted_errinjct(token)) {
int err;
err = security_locked_down(LOCKDOWN_RTAS_ERROR_INJECTION);
@@ -1247,7 +1811,7 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
}
/* Need to handle ibm,suspend_me call specially */
- if (token == rtas_token("ibm,suspend-me")) {
+ if (token == rtas_function_token(RTAS_FN_IBM_SUSPEND_ME)) {
/*
* rtas_ibm_suspend_me assumes the streamid handle is in cpu
@@ -1268,18 +1832,18 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
buff_copy = get_errorlog_buffer();
- flags = lock_rtas();
+ raw_spin_lock_irqsave(&rtas_lock, flags);
- rtas.args = args;
- do_enter_rtas(__pa(&rtas.args));
- args = rtas.args;
+ rtas_args = args;
+ do_enter_rtas(&rtas_args);
+ args = rtas_args;
/* A -1 return code indicates that the last command couldn't
be completed due to a hardware error. */
if (be32_to_cpu(args.rets[0]) == -1)
errbuf = __fetch_rtas_last_error(buff_copy);
- unlock_rtas(flags);
+ raw_spin_unlock_irqrestore(&rtas_lock, flags);
if (buff_copy) {
if (errbuf)
@@ -1297,6 +1861,54 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
return 0;
}
+static void __init rtas_function_table_init(void)
+{
+ struct property *prop;
+
+ for (size_t i = 0; i < ARRAY_SIZE(rtas_function_table); ++i) {
+ struct rtas_function *curr = &rtas_function_table[i];
+ struct rtas_function *prior;
+ int cmp;
+
+ curr->token = RTAS_UNKNOWN_SERVICE;
+
+ if (i == 0)
+ continue;
+ /*
+ * Ensure table is sorted correctly for binary search
+ * on function names.
+ */
+ prior = &rtas_function_table[i - 1];
+
+ cmp = strcmp(prior->name, curr->name);
+ if (cmp < 0)
+ continue;
+
+ if (cmp == 0) {
+ pr_err("'%s' has duplicate function table entries\n",
+ curr->name);
+ } else {
+ pr_err("function table unsorted: '%s' wrongly precedes '%s'\n",
+ prior->name, curr->name);
+ }
+ }
+
+ for_each_property_of_node(rtas.dev, prop) {
+ struct rtas_function *func;
+
+ if (prop->length != sizeof(u32))
+ continue;
+
+ func = __rtas_name_to_function(prop->name);
+ if (!func)
+ continue;
+
+ func->token = be32_to_cpup((__be32 *)prop->value);
+
+ pr_debug("function %s has token %u\n", func->name, func->token);
+ }
+}
+
/*
* Call early during boot, before mem init, to retrieve the RTAS
* information from the device-tree and allocate the RMO buffer for userland
@@ -1330,12 +1942,14 @@ void __init rtas_initialize(void)
init_error_log_max();
+ /* Must be called before any function token lookups */
+ rtas_function_table_init();
+
/*
- * Discover these now to avoid device tree lookups in the
+ * Discover this now to avoid a device tree lookup in the
* panic path.
*/
- if (of_property_read_bool(rtas.dev, "ibm,extended-os-term"))
- ibm_os_term_token = rtas_token("ibm,os-term");
+ ibm_extended_os_term = of_property_read_bool(rtas.dev, "ibm,extended-os-term");
/* If RTAS was found, allocate the RMO buffer for it and look for
* the stop-self token if any
@@ -1350,12 +1964,7 @@ void __init rtas_initialize(void)
panic("ERROR: RTAS: Failed to allocate %lx bytes below %pa\n",
PAGE_SIZE, &rtas_region);
-#ifdef CONFIG_RTAS_ERROR_LOGGING
- rtas_last_error_token = rtas_token("rtas-last-error");
-#endif
- ibm_open_errinjct_token = rtas_token("ibm,open-errinjct");
- ibm_errinjct_token = rtas_token("ibm,errinjct");
- rtas_syscall_filter_init();
+ rtas_work_area_reserve_arena(rtas_region);
}
int __init early_init_dt_scan_rtas(unsigned long node,
@@ -1401,23 +2010,22 @@ int __init early_init_dt_scan_rtas(unsigned long node,
return 1;
}
-static arch_spinlock_t timebase_lock;
+static DEFINE_RAW_SPINLOCK(timebase_lock);
static u64 timebase = 0;
void rtas_give_timebase(void)
{
unsigned long flags;
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&timebase_lock, flags);
hard_irq_disable();
- arch_spin_lock(&timebase_lock);
- rtas_call(rtas_token("freeze-time-base"), 0, 1, NULL);
+ rtas_call(rtas_function_token(RTAS_FN_FREEZE_TIME_BASE), 0, 1, NULL);
timebase = get_tb();
- arch_spin_unlock(&timebase_lock);
+ raw_spin_unlock(&timebase_lock);
while (timebase)
barrier();
- rtas_call(rtas_token("thaw-time-base"), 0, 1, NULL);
+ rtas_call(rtas_function_token(RTAS_FN_THAW_TIME_BASE), 0, 1, NULL);
local_irq_restore(flags);
}
@@ -1425,8 +2033,8 @@ void rtas_take_timebase(void)
{
while (!timebase)
barrier();
- arch_spin_lock(&timebase_lock);
+ raw_spin_lock(&timebase_lock);
set_tb(timebase >> 32, timebase & 0xffffffff);
timebase = 0;
- arch_spin_unlock(&timebase_lock);
+ raw_spin_unlock(&timebase_lock);
}
diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c
index bc817a5619d6..4caf5e3079eb 100644
--- a/arch/powerpc/kernel/rtas_flash.c
+++ b/arch/powerpc/kernel/rtas_flash.c
@@ -376,7 +376,7 @@ static void manage_flash(struct rtas_manage_flash_t *args_buf, unsigned int op)
s32 rc;
do {
- rc = rtas_call(rtas_token("ibm,manage-flash-image"), 1, 1,
+ rc = rtas_call(rtas_function_token(RTAS_FN_IBM_MANAGE_FLASH_IMAGE), 1, 1,
NULL, op);
} while (rtas_busy_delay(rc));
@@ -444,7 +444,7 @@ error:
*/
static void validate_flash(struct rtas_validate_flash_t *args_buf)
{
- int token = rtas_token("ibm,validate-flash-image");
+ int token = rtas_function_token(RTAS_FN_IBM_VALIDATE_FLASH_IMAGE);
int update_results;
s32 rc;
@@ -570,7 +570,7 @@ static void rtas_flash_firmware(int reboot_type)
return;
}
- update_token = rtas_token("ibm,update-flash-64-and-reboot");
+ update_token = rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT);
if (update_token == RTAS_UNKNOWN_SERVICE) {
printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot "
"is not available -- not a service partition?\n");
@@ -653,7 +653,7 @@ static void rtas_flash_firmware(int reboot_type)
*/
struct rtas_flash_file {
const char *filename;
- const char *rtas_call_name;
+ const rtas_fn_handle_t handle;
int *status;
const struct proc_ops ops;
};
@@ -661,7 +661,7 @@ struct rtas_flash_file {
static const struct rtas_flash_file rtas_flash_files[] = {
{
.filename = "powerpc/rtas/" FIRMWARE_FLASH_NAME,
- .rtas_call_name = "ibm,update-flash-64-and-reboot",
+ .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT,
.status = &rtas_update_flash_data.status,
.ops.proc_read = rtas_flash_read_msg,
.ops.proc_write = rtas_flash_write,
@@ -670,7 +670,7 @@ static const struct rtas_flash_file rtas_flash_files[] = {
},
{
.filename = "powerpc/rtas/" FIRMWARE_UPDATE_NAME,
- .rtas_call_name = "ibm,update-flash-64-and-reboot",
+ .handle = RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT,
.status = &rtas_update_flash_data.status,
.ops.proc_read = rtas_flash_read_num,
.ops.proc_write = rtas_flash_write,
@@ -679,7 +679,7 @@ static const struct rtas_flash_file rtas_flash_files[] = {
},
{
.filename = "powerpc/rtas/" VALIDATE_FLASH_NAME,
- .rtas_call_name = "ibm,validate-flash-image",
+ .handle = RTAS_FN_IBM_VALIDATE_FLASH_IMAGE,
.status = &rtas_validate_flash_data.status,
.ops.proc_read = validate_flash_read,
.ops.proc_write = validate_flash_write,
@@ -688,7 +688,7 @@ static const struct rtas_flash_file rtas_flash_files[] = {
},
{
.filename = "powerpc/rtas/" MANAGE_FLASH_NAME,
- .rtas_call_name = "ibm,manage-flash-image",
+ .handle = RTAS_FN_IBM_MANAGE_FLASH_IMAGE,
.status = &rtas_manage_flash_data.status,
.ops.proc_read = manage_flash_read,
.ops.proc_write = manage_flash_write,
@@ -700,8 +700,7 @@ static int __init rtas_flash_init(void)
{
int i;
- if (rtas_token("ibm,update-flash-64-and-reboot") ==
- RTAS_UNKNOWN_SERVICE) {
+ if (rtas_function_token(RTAS_FN_IBM_UPDATE_FLASH_64_AND_REBOOT) == RTAS_UNKNOWN_SERVICE) {
pr_info("rtas_flash: no firmware flash support\n");
return -EINVAL;
}
@@ -730,7 +729,7 @@ static int __init rtas_flash_init(void)
* This code assumes that the status int is the first member of the
* struct
*/
- token = rtas_token(f->rtas_call_name);
+ token = rtas_function_token(f->handle);
if (token == RTAS_UNKNOWN_SERVICE)
*f->status = FLASH_AUTH;
else
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 5a2f5ea3b054..e1fdc7473b72 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -191,10 +191,10 @@ static void python_countermeasures(struct device_node *dev)
void __init init_pci_config_tokens(void)
{
- read_pci_config = rtas_token("read-pci-config");
- write_pci_config = rtas_token("write-pci-config");
- ibm_read_pci_config = rtas_token("ibm,read-pci-config");
- ibm_write_pci_config = rtas_token("ibm,write-pci-config");
+ read_pci_config = rtas_function_token(RTAS_FN_READ_PCI_CONFIG);
+ write_pci_config = rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG);
+ ibm_read_pci_config = rtas_function_token(RTAS_FN_IBM_READ_PCI_CONFIG);
+ ibm_write_pci_config = rtas_function_token(RTAS_FN_IBM_WRITE_PCI_CONFIG);
}
unsigned long get_phb_buid(struct device_node *phb)
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index cc56ac6ba4b0..9bba469239fc 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -506,7 +506,7 @@ static int __init rtas_event_scan_init(void)
return 0;
/* No RTAS */
- event_scan = rtas_token("event-scan");
+ event_scan = rtas_function_token(RTAS_FN_EVENT_SCAN);
if (event_scan == RTAS_UNKNOWN_SERVICE) {
printk(KERN_INFO "rtasd: No event-scan on system\n");
return -ENODEV;
diff --git a/arch/powerpc/kernel/secvar-ops.c b/arch/powerpc/kernel/secvar-ops.c
index 6a29777d6a2d..19172a2804f0 100644
--- a/arch/powerpc/kernel/secvar-ops.c
+++ b/arch/powerpc/kernel/secvar-ops.c
@@ -8,10 +8,16 @@
#include <linux/cache.h>
#include <asm/secvar.h>
+#include <asm/bug.h>
-const struct secvar_operations *secvar_ops __ro_after_init;
+const struct secvar_operations *secvar_ops __ro_after_init = NULL;
-void set_secvar_ops(const struct secvar_operations *ops)
+int set_secvar_ops(const struct secvar_operations *ops)
{
+ if (WARN_ON_ONCE(secvar_ops))
+ return -EBUSY;
+
secvar_ops = ops;
+
+ return 0;
}
diff --git a/arch/powerpc/kernel/secvar-sysfs.c b/arch/powerpc/kernel/secvar-sysfs.c
index 1ee4640a2641..eb3c053f323f 100644
--- a/arch/powerpc/kernel/secvar-sysfs.c
+++ b/arch/powerpc/kernel/secvar-sysfs.c
@@ -21,56 +21,48 @@ static struct kset *secvar_kset;
static ssize_t format_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- ssize_t rc = 0;
- struct device_node *node;
- const char *format;
-
- node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
- if (!of_device_is_available(node)) {
- rc = -ENODEV;
- goto out;
- }
-
- rc = of_property_read_string(node, "format", &format);
- if (rc)
- goto out;
-
- rc = sprintf(buf, "%s\n", format);
+ char tmp[32];
+ ssize_t len = secvar_ops->format(tmp, sizeof(tmp));
-out:
- of_node_put(node);
+ if (len > 0)
+ return sysfs_emit(buf, "%s\n", tmp);
+ else if (len < 0)
+ pr_err("Error %zd reading format string\n", len);
+ else
+ pr_err("Got empty format string from backend\n");
- return rc;
+ return -EIO;
}
static ssize_t size_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- uint64_t dsize;
+ u64 dsize;
int rc;
rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize);
if (rc) {
- pr_err("Error retrieving %s variable size %d\n", kobj->name,
- rc);
+ if (rc != -ENOENT)
+ pr_err("Error retrieving %s variable size %d\n", kobj->name, rc);
return rc;
}
- return sprintf(buf, "%llu\n", dsize);
+ return sysfs_emit(buf, "%llu\n", dsize);
}
static ssize_t data_read(struct file *filep, struct kobject *kobj,
struct bin_attribute *attr, char *buf, loff_t off,
size_t count)
{
- uint64_t dsize;
char *data;
+ u64 dsize;
int rc;
rc = secvar_ops->get(kobj->name, strlen(kobj->name) + 1, NULL, &dsize);
if (rc) {
- pr_err("Error getting %s variable size %d\n", kobj->name, rc);
+ if (rc != -ENOENT)
+ pr_err("Error getting %s variable size %d\n", kobj->name, rc);
return rc;
}
pr_debug("dsize is %llu\n", dsize);
@@ -141,34 +133,58 @@ static struct kobj_type secvar_ktype = {
static int update_kobj_size(void)
{
- struct device_node *node;
u64 varsize;
- int rc = 0;
-
- node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
- if (!of_device_is_available(node)) {
- rc = -ENODEV;
- goto out;
- }
+ int rc = secvar_ops->max_size(&varsize);
- rc = of_property_read_u64(node, "max-var-size", &varsize);
if (rc)
- goto out;
+ return rc;
data_attr.size = varsize;
update_attr.size = varsize;
-out:
- of_node_put(node);
+ return 0;
+}
- return rc;
+static int secvar_sysfs_config(struct kobject *kobj)
+{
+ struct attribute_group config_group = {
+ .name = "config",
+ .attrs = (struct attribute **)secvar_ops->config_attrs,
+ };
+
+ if (secvar_ops->config_attrs)
+ return sysfs_create_group(kobj, &config_group);
+
+ return 0;
+}
+
+static int add_var(const char *name)
+{
+ struct kobject *kobj;
+ int rc;
+
+ kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+ if (!kobj)
+ return -ENOMEM;
+
+ kobject_init(kobj, &secvar_ktype);
+
+ rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name);
+ if (rc) {
+ pr_warn("kobject_add error %d for attribute: %s\n", rc,
+ name);
+ kobject_put(kobj);
+ return rc;
+ }
+
+ kobject_uevent(kobj, KOBJ_ADD);
+ return 0;
}
static int secvar_sysfs_load(void)
{
+ u64 namesize = 0;
char *name;
- uint64_t namesize = 0;
- struct kobject *kobj;
int rc;
name = kzalloc(NAME_MAX_SIZE, GFP_KERNEL);
@@ -179,73 +195,99 @@ static int secvar_sysfs_load(void)
rc = secvar_ops->get_next(name, &namesize, NAME_MAX_SIZE);
if (rc) {
if (rc != -ENOENT)
- pr_err("error getting secvar from firmware %d\n",
- rc);
- break;
- }
+ pr_err("error getting secvar from firmware %d\n", rc);
+ else
+ rc = 0;
- kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
- if (!kobj) {
- rc = -ENOMEM;
break;
}
- kobject_init(kobj, &secvar_ktype);
-
- rc = kobject_add(kobj, &secvar_kset->kobj, "%s", name);
- if (rc) {
- pr_warn("kobject_add error %d for attribute: %s\n", rc,
- name);
- kobject_put(kobj);
- kobj = NULL;
- }
-
- if (kobj)
- kobject_uevent(kobj, KOBJ_ADD);
-
+ rc = add_var(name);
} while (!rc);
kfree(name);
return rc;
}
+static int secvar_sysfs_load_static(void)
+{
+ const char * const *name_ptr = secvar_ops->var_names;
+ int rc;
+
+ while (*name_ptr) {
+ rc = add_var(*name_ptr);
+ if (rc)
+ return rc;
+ name_ptr++;
+ }
+
+ return 0;
+}
+
static int secvar_sysfs_init(void)
{
+ u64 max_size;
int rc;
if (!secvar_ops) {
- pr_warn("secvar: failed to retrieve secvar operations.\n");
+ pr_warn("Failed to retrieve secvar operations\n");
return -ENODEV;
}
secvar_kobj = kobject_create_and_add("secvar", firmware_kobj);
if (!secvar_kobj) {
- pr_err("secvar: Failed to create firmware kobj\n");
+ pr_err("Failed to create firmware kobj\n");
return -ENOMEM;
}
rc = sysfs_create_file(secvar_kobj, &format_attr.attr);
if (rc) {
- kobject_put(secvar_kobj);
- return -ENOMEM;
+ pr_err("Failed to create format object\n");
+ rc = -ENOMEM;
+ goto err;
}
secvar_kset = kset_create_and_add("vars", NULL, secvar_kobj);
if (!secvar_kset) {
- pr_err("secvar: sysfs kobject registration failed.\n");
- kobject_put(secvar_kobj);
- return -ENOMEM;
+ pr_err("sysfs kobject registration failed\n");
+ rc = -ENOMEM;
+ goto err;
}
rc = update_kobj_size();
if (rc) {
pr_err("Cannot read the size of the attribute\n");
- return rc;
+ goto err;
+ }
+
+ rc = secvar_sysfs_config(secvar_kobj);
+ if (rc) {
+ pr_err("Failed to create config directory\n");
+ goto err;
}
- secvar_sysfs_load();
+ if (secvar_ops->get_next)
+ rc = secvar_sysfs_load();
+ else
+ rc = secvar_sysfs_load_static();
+
+ if (rc) {
+ pr_err("Failed to create variable attributes\n");
+ goto err;
+ }
+
+ // Due to sysfs limitations, we will only ever get a write buffer of
+ // up to 1 page in size. Print a warning if this is potentially going
+ // to cause problems, so that the user is aware.
+ secvar_ops->max_size(&max_size);
+ if (max_size > PAGE_SIZE)
+ pr_warn_ratelimited("PAGE_SIZE (%lu) is smaller than maximum object size (%llu), writes are limited to PAGE_SIZE\n",
+ PAGE_SIZE, max_size);
return 0;
+err:
+ kobject_put(secvar_kobj);
+ return rc;
}
late_initcall(secvar_sysfs_init);
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 9b10e57040c6..e77734e5a127 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -87,6 +87,10 @@ EXPORT_SYMBOL(machine_id);
int boot_cpuid = -1;
EXPORT_SYMBOL_GPL(boot_cpuid);
+#ifdef CONFIG_PPC64
+int boot_cpu_hwid = -1;
+#endif
+
/*
* These are used in binfmt_elf.c to put aux entries on the stack
* for each elf executable being started.
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index a0dee7354fe6..b2e0d3ce4261 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -385,17 +385,21 @@ void __init early_setup(unsigned long dt_ptr)
/*
* Do early initialization using the flattened device
* tree, such as retrieving the physical memory map or
- * calculating/retrieving the hash table size.
+ * calculating/retrieving the hash table size, discover
+ * boot_cpuid and boot_cpu_hwid.
*/
early_init_devtree(__va(dt_ptr));
- /* Now we know the logical id of our boot cpu, setup the paca. */
- if (boot_cpuid != 0) {
- /* Poison paca_ptrs[0] again if it's not the boot cpu */
- memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
- }
+ allocate_paca_ptrs();
+ allocate_paca(boot_cpuid);
+ set_hard_smp_processor_id(boot_cpuid, boot_cpu_hwid);
fixup_boot_paca(paca_ptrs[boot_cpuid]);
setup_paca(paca_ptrs[boot_cpuid]); /* install the paca into registers */
+ // smp_processor_id() now reports boot_cpuid
+
+#ifdef CONFIG_SMP
+ task_thread_info(current)->cpu = boot_cpuid; // fix task_cpu(current)
+#endif
/*
* Configure exception handlers. This include setting up trampolines
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e26eb6618ae5..9d8665910350 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -356,7 +356,7 @@ void vtime_flush(struct task_struct *tsk)
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-void __delay(unsigned long loops)
+void __no_kcsan __delay(unsigned long loops)
{
unsigned long start;
@@ -377,7 +377,7 @@ void __delay(unsigned long loops)
}
EXPORT_SYMBOL(__delay);
-void udelay(unsigned long usecs)
+void __no_kcsan udelay(unsigned long usecs)
{
__delay(tb_ticks_per_usec * usecs);
}
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
index af8527538fe4..b16a9f9c0b35 100644
--- a/arch/powerpc/kernel/trace/Makefile
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -23,4 +23,5 @@ obj-$(CONFIG_PPC32) += $(obj32-y)
# Disable GCOV, KCOV & sanitizers in odd or sensitive code
GCOV_PROFILE_ftrace.o := n
KCOV_INSTRUMENT_ftrace.o := n
+KCSAN_SANITIZE_ftrace.o := n
UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index 6a977b0d8ffc..66f723f53be2 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -16,6 +16,11 @@ ifneq ($(c-gettimeofday-y),)
CFLAGS_vgettimeofday-32.o += -ffreestanding -fasynchronous-unwind-tables
CFLAGS_REMOVE_vgettimeofday-32.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_vgettimeofday-32.o += -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc
+ # This flag is supported by clang for 64-bit but not 32-bit so it will cause
+ # an unused command line flag warning for this file.
+ ifdef CONFIG_CC_IS_CLANG
+ CFLAGS_REMOVE_vgettimeofday-32.o += -fno-stack-clash-protection
+ endif
CFLAGS_vgettimeofday-64.o += -include $(c-gettimeofday-y)
CFLAGS_vgettimeofday-64.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
CFLAGS_vgettimeofday-64.o += $(call cc-option, -fno-stack-protector)
@@ -46,15 +51,20 @@ GCOV_PROFILE := n
KCOV_INSTRUMENT := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
-ccflags-y := -shared -fno-common -fno-builtin -nostdlib -Wl,--hash-style=both
-ccflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld)
+ccflags-y := -fno-common -fno-builtin
+ldflags-y := -Wl,--hash-style=both -nostdlib -shared -z noexecstack
+ldflags-$(CONFIG_LD_IS_LLD) += $(call cc-option,--ld-path=$(LD),-fuse-ld=lld)
+# Filter flags that clang will warn are unused for linking
+ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CFLAGS))
-CC32FLAGS := -Wl,-soname=linux-vdso32.so.1 -m32
-AS32FLAGS := -D__VDSO32__ -s
+CC32FLAGS := -m32
+LD32FLAGS := -Wl,-soname=linux-vdso32.so.1
+AS32FLAGS := -D__VDSO32__
-CC64FLAGS := -Wl,-soname=linux-vdso64.so.1
-AS64FLAGS := -D__VDSO64__ -s
+LD64FLAGS := -Wl,-soname=linux-vdso64.so.1
+AS64FLAGS := -D__VDSO64__
targets += vdso32.lds
CPPFLAGS_vdso32.lds += -P -C -Upowerpc
@@ -92,15 +102,15 @@ include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE
# actual build commands
quiet_cmd_vdso32ld_and_check = VDSO32L $@
- cmd_vdso32ld_and_check = $(VDSOCC) $(c_flags) $(CC32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -z noexecstack ; $(cmd_vdso_check)
+ cmd_vdso32ld_and_check = $(VDSOCC) $(ldflags-y) $(CC32FLAGS) $(LD32FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check)
quiet_cmd_vdso32as = VDSO32A $@
cmd_vdso32as = $(VDSOCC) $(a_flags) $(CC32FLAGS) $(AS32FLAGS) -c -o $@ $<
quiet_cmd_vdso32cc = VDSO32C $@
cmd_vdso32cc = $(VDSOCC) $(c_flags) $(CC32FLAGS) -c -o $@ $<
quiet_cmd_vdso64ld_and_check = VDSO64L $@
- cmd_vdso64ld_and_check = $(VDSOCC) $(c_flags) $(CC64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^) -z noexecstack ; $(cmd_vdso_check)
+ cmd_vdso64ld_and_check = $(VDSOCC) $(ldflags-y) $(LD64FLAGS) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^); $(cmd_vdso_check)
quiet_cmd_vdso64as = VDSO64A $@
- cmd_vdso64as = $(VDSOCC) $(a_flags) $(CC64FLAGS) $(AS64FLAGS) -c -o $@ $<
+ cmd_vdso64as = $(VDSOCC) $(a_flags) $(AS64FLAGS) -c -o $@ $<
OBJECT_FILES_NON_STANDARD := y
diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 9be3e818a240..110d28bede2a 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -28,6 +28,7 @@
#include <asm/crashdump-ppc64.h>
#include <asm/mmzone.h>
#include <asm/prom.h>
+#include <asm/plpks.h>
struct umem_info {
u64 *buf; /* data buffer for usable-memory property */
@@ -689,7 +690,8 @@ static int update_usable_mem_fdt(void *fdt, struct crash_mem *usable_mem)
ret = fdt_setprop(fdt, node, "linux,drconf-usable-memory",
um_info.buf, (um_info.idx * sizeof(u64)));
if (ret) {
- pr_err("Failed to update fdt with linux,drconf-usable-memory property");
+ pr_err("Failed to update fdt with linux,drconf-usable-memory property: %s",
+ fdt_strerror(ret));
goto out;
}
}
@@ -978,12 +980,17 @@ static unsigned int cpu_node_size(void)
*/
unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image)
{
- unsigned int cpu_nodes, extra_size;
+ unsigned int cpu_nodes, extra_size = 0;
struct device_node *dn;
u64 usm_entries;
+ // Budget some space for the password blob. There's already extra space
+ // for the key name
+ if (plpks_is_available())
+ extra_size += (unsigned int)plpks_get_passwordlen();
+
if (image->type != KEXEC_TYPE_CRASH)
- return 0;
+ return extra_size;
/*
* For kdump kernel, account for linux,usable-memory and
@@ -993,9 +1000,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image)
if (drmem_lmb_size()) {
usm_entries = ((memory_hotplug_max() / drmem_lmb_size()) +
(2 * (resource_size(&crashk_res) / drmem_lmb_size())));
- extra_size = (unsigned int)(usm_entries * sizeof(u64));
- } else {
- extra_size = 0;
+ extra_size += (unsigned int)(usm_entries * sizeof(u64));
}
/*
@@ -1234,6 +1239,10 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
}
}
+ // If we have PLPKS active, we need to provide the password to the new kernel
+ if (plpks_is_available())
+ ret = plpks_populate_fdt(fdt);
+
out:
kfree(rmem);
kfree(umem);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 6d525285dbe8..57f4e7896d67 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -999,16 +999,6 @@ int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
-int kvmppc_core_check_processor_compat(void)
-{
- /*
- * We always return 0 for book3s. We check
- * for compatibility while loading the HV
- * or PR module
- */
- return 0;
-}
-
int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
{
return kvm->arch.kvm_ops->hcall_implemented(hcall);
@@ -1062,7 +1052,7 @@ static int kvmppc_book3s_init(void)
{
int r;
- r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
if (r)
return r;
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index e89281d3ba28..01adffb24667 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1210,7 +1210,7 @@ int kvmppc_handle_exit(struct kvm_vcpu *vcpu, unsigned int exit_nr)
/*
* On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC,
- * see kvmppc_core_check_processor_compat().
+ * see kvmppc_e500mc_check_processor_compat().
*/
#ifdef CONFIG_ALTIVEC
case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL:
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index c8b2b4478545..b0f695428733 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -314,7 +314,7 @@ static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu)
kvmppc_booke_vcpu_put(vcpu);
}
-int kvmppc_core_check_processor_compat(void)
+static int kvmppc_e500_check_processor_compat(void)
{
int r;
@@ -507,7 +507,7 @@ static int __init kvmppc_e500_init(void)
unsigned long handler_len;
unsigned long max_ivor = 0;
- r = kvmppc_core_check_processor_compat();
+ r = kvmppc_e500_check_processor_compat();
if (r)
goto err_out;
@@ -531,7 +531,7 @@ static int __init kvmppc_e500_init(void)
flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers +
ivor[max_ivor] + handler_len);
- r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+ r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
if (r)
goto err_out;
kvm_ops_e500.owner = THIS_MODULE;
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index 57e0ad6a2ca3..a309138927ff 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -168,7 +168,7 @@ static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu)
kvmppc_booke_vcpu_put(vcpu);
}
-int kvmppc_core_check_processor_compat(void)
+int kvmppc_e500mc_check_processor_compat(void)
{
int r;
@@ -388,6 +388,10 @@ static int __init kvmppc_e500mc_init(void)
{
int r;
+ r = kvmppc_e500mc_check_processor_compat();
+ if (r)
+ goto err_out;
+
r = kvmppc_booke_init();
if (r)
goto err_out;
@@ -400,7 +404,7 @@ static int __init kvmppc_e500mc_init(void)
*/
kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core);
- r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+ r = kvm_init(sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
if (r)
goto err_out;
kvm_ops_e500mc.owner = THIS_MODULE;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 04494a4fb37a..4c5405fc5538 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -435,21 +435,6 @@ int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr,
}
EXPORT_SYMBOL_GPL(kvmppc_ld);
-int kvm_arch_hardware_enable(void)
-{
- return 0;
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return kvmppc_core_check_processor_compat();
-}
-
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
struct kvmppc_ops *kvm_ops = NULL;
@@ -2544,11 +2529,6 @@ void kvmppc_init_lpid(unsigned long nr_lpids_param)
}
EXPORT_SYMBOL_GPL(kvmppc_init_lpid);
-int kvm_arch_init(void *opaque)
-{
- return 0;
-}
-
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr);
void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry)
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 4de71cbf6e8e..c4db459d304a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -16,6 +16,8 @@ KASAN_SANITIZE_feature-fixups.o := n
# restart_table.o contains functions called in the NMI interrupt path
# which can be in real mode. Disable KASAN.
KASAN_SANITIZE_restart_table.o := n
+KCSAN_SANITIZE_code-patching.o := n
+KCSAN_SANITIZE_feature-fixups.o := n
ifdef CONFIG_KASAN
CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 44a35ed4f686..fedffe3ae136 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1051,7 +1051,8 @@ static void __init htab_initialize(void)
static_branch_enable(&stress_hpt_key);
// Too early to use nr_cpu_ids, so use NR_CPUS
tmp = memblock_phys_alloc_range(sizeof(struct stress_hpt_struct) * NR_CPUS,
- 0, 0, MEMBLOCK_ALLOC_ANYWHERE);
+ __alignof__(struct stress_hpt_struct),
+ 0, MEMBLOCK_ALLOC_ANYWHERE);
memset((void *)tmp, 0xff, sizeof(struct stress_hpt_struct) * NR_CPUS);
stress_hpt_struct = __va(tmp);
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 4e29b619578c..e50bc5fc7ddf 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -700,12 +700,13 @@ static inline void _tlbiel_va_range_multicast(struct mm_struct *mm,
*/
void radix__local_flush_tlb_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_TLB);
+ _tlbiel_pid(pid, RIC_FLUSH_TLB);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_tlb_mm);
@@ -713,12 +714,13 @@ EXPORT_SYMBOL(radix__local_flush_tlb_mm);
#ifndef CONFIG_SMP
void radix__local_flush_all_mm(struct mm_struct *mm)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_ALL);
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
preempt_enable();
}
EXPORT_SYMBOL(radix__local_flush_all_mm);
@@ -732,12 +734,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
int psize)
{
- unsigned long pid;
+ unsigned long pid = mm->context.id;
+
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
+ return;
preempt_disable();
- pid = mm->context.id;
- if (pid != MMU_NO_CONTEXT)
- _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
}
@@ -945,7 +948,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -985,7 +988,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1024,7 +1027,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
@@ -1104,6 +1107,9 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
+/*
+ * Doesn't appear to be used anywhere. Remove.
+ */
#define TLB_FLUSH_ALL -1UL
/*
@@ -1125,23 +1131,22 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid, flush_pwc = false;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
+
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1179,15 +1184,12 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm,
}
}
} else {
- bool hflush = false;
+ bool hflush;
unsigned long hstart, hend;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- hstart = (start + PMD_SIZE - 1) & PMD_MASK;
- hend = end & PMD_MASK;
- if (hstart < hend)
- hflush = true;
- }
+ hstart = (start + PMD_SIZE - 1) & PMD_MASK;
+ hend = end & PMD_MASK;
+ hflush = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hstart < hend;
if (type == FLUSH_TYPE_LOCAL) {
asm volatile("ptesync": : :"memory");
@@ -1302,7 +1304,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* that flushes the process table entry cache upon process teardown.
* See the comment for radix in arch_exit_mmap().
*/
- if (tlb->fullmm || tlb->need_flush_all) {
+ if (tlb->fullmm) {
__flush_all_mm(mm, true);
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
@@ -1325,25 +1327,22 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned int page_shift = mmu_psize_defs[psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
- bool fullmm = (end == TLB_FLUSH_ALL);
bool flush_pid;
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
- fullmm = (end == TLB_FLUSH_ALL);
+ WARN_ON_ONCE(end == TLB_FLUSH_ALL);
preempt_disable();
smp_mb(); /* see radix__flush_tlb_mm */
- type = flush_type_needed(mm, fullmm);
+ type = flush_type_needed(mm, false);
if (type == FLUSH_TYPE_NONE)
goto out;
- if (fullmm)
- flush_pid = true;
- else if (type == FLUSH_TYPE_GLOBAL)
+ if (type == FLUSH_TYPE_GLOBAL)
flush_pid = nr_pages > tlb_single_page_flush_ceiling;
else
flush_pid = nr_pages > tlb_local_single_page_flush_ceiling;
@@ -1406,7 +1405,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
enum tlb_flush_type type;
pid = mm->context.id;
- if (unlikely(pid == MMU_NO_CONTEXT))
+ if (WARN_ON_ONCE(pid == MMU_NO_CONTEXT))
return;
/* 4k page size, just blow the world */
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index bd9784f77f2e..c6dccb4f06dc 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -120,6 +120,7 @@ extern int switch_to_as1(void);
extern void restore_to_as0(int esel, int offset, void *dt_ptr, int bootcpu);
void create_kaslr_tlb_entry(int entry, unsigned long virt, phys_addr_t phys);
void reloc_kernel_entry(void *fdt, int addr);
+void relocate_init(u64 dt_ptr, phys_addr_t start);
extern int is_second_reloc;
#endif
extern void loadcam_entry(unsigned int index);
diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
index c7d4b317a823..58c8d9849cb1 100644
--- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c
+++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c
@@ -45,7 +45,9 @@ static inline void book3e_tlb_lock(void)
if (!cpu_has_feature(CPU_FTR_SMT))
return;
- asm volatile("1: lbarx %0, 0, %1;"
+ asm volatile(".machine push;"
+ ".machine e6500;"
+ "1: lbarx %0, 0, %1;"
"cmpwi %0, 0;"
"bne 2f;"
"stbcx. %2, 0, %1;"
@@ -56,6 +58,7 @@ static inline void book3e_tlb_lock(void)
"bne 2b;"
"b 1b;"
"3:"
+ ".machine pop;"
: "=&r" (tmp)
: "r" (&paca->tcd_ptr->lock), "r" (token)
: "memory");
diff --git a/arch/powerpc/mm/nohash/tlb_low_64e.S b/arch/powerpc/mm/nohash/tlb_low_64e.S
index 76cf456d7976..7e0b8fe1c279 100644
--- a/arch/powerpc/mm/nohash/tlb_low_64e.S
+++ b/arch/powerpc/mm/nohash/tlb_low_64e.S
@@ -351,7 +351,7 @@ END_FTR_SECTION_NESTED(CPU_FTR_EMB_HV,CPU_FTR_EMB_HV,532)
mfspr r15,SPRN_MAS2
isync
- tlbilxva 0,r15
+ PPC_TLBILX_VA(0,R15)
isync
mtspr SPRN_MAS6,r10
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index a4f7880f959d..d767e39d5645 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -169,7 +169,7 @@ static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i)
void bpf_jit_init_reg_mapping(struct codegen_context *ctx);
int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func);
int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx,
- u32 *addrs, int pass);
+ u32 *addrs, int pass, bool extra_pass);
void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
void bpf_jit_realloc_regs(struct codegen_context *ctx);
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 43e634126514..e93aefcfb83f 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -23,74 +23,6 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
memset32(area, BREAKPOINT_INSTRUCTION, size / 4);
}
-/* Fix updated addresses (for subprog calls, ldimm64, et al) during extra pass */
-static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image,
- struct codegen_context *ctx, u32 *addrs)
-{
- const struct bpf_insn *insn = fp->insnsi;
- bool func_addr_fixed;
- u64 func_addr;
- u32 tmp_idx;
- int i, j, ret;
-
- for (i = 0; i < fp->len; i++) {
- /*
- * During the extra pass, only the branch target addresses for
- * the subprog calls need to be fixed. All other instructions
- * can left untouched.
- *
- * The JITed image length does not change because we already
- * ensure that the JITed instruction sequence for these calls
- * are of fixed length by padding them with NOPs.
- */
- if (insn[i].code == (BPF_JMP | BPF_CALL) &&
- insn[i].src_reg == BPF_PSEUDO_CALL) {
- ret = bpf_jit_get_func_addr(fp, &insn[i], true,
- &func_addr,
- &func_addr_fixed);
- if (ret < 0)
- return ret;
-
- /*
- * Save ctx->idx as this would currently point to the
- * end of the JITed image and set it to the offset of
- * the instruction sequence corresponding to the
- * subprog call temporarily.
- */
- tmp_idx = ctx->idx;
- ctx->idx = addrs[i] / 4;
- ret = bpf_jit_emit_func_call_rel(image, ctx, func_addr);
- if (ret)
- return ret;
-
- /*
- * Restore ctx->idx here. This is safe as the length
- * of the JITed sequence remains unchanged.
- */
- ctx->idx = tmp_idx;
- } else if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) {
- tmp_idx = ctx->idx;
- ctx->idx = addrs[i] / 4;
-#ifdef CONFIG_PPC32
- PPC_LI32(bpf_to_ppc(insn[i].dst_reg) - 1, (u32)insn[i + 1].imm);
- PPC_LI32(bpf_to_ppc(insn[i].dst_reg), (u32)insn[i].imm);
- for (j = ctx->idx - addrs[i] / 4; j < 4; j++)
- EMIT(PPC_RAW_NOP());
-#else
- func_addr = ((u64)(u32)insn[i].imm) | (((u64)(u32)insn[i + 1].imm) << 32);
- PPC_LI64(bpf_to_ppc(insn[i].dst_reg), func_addr);
- /* overwrite rest with nops */
- for (j = ctx->idx - addrs[i] / 4; j < 5; j++)
- EMIT(PPC_RAW_NOP());
-#endif
- ctx->idx = tmp_idx;
- i++;
- }
- }
-
- return 0;
-}
-
int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr)
{
if (!exit_addr || is_offset_in_branch_range(exit_addr - (ctx->idx * 4))) {
@@ -185,7 +117,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
/* Scouting faux-generate pass 0 */
- if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) {
+ if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) {
/* We hit something illegal or unsupported. */
fp = org_fp;
goto out_addrs;
@@ -200,7 +132,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
*/
if (cgctx.seen & SEEN_TAILCALL || !is_offset_in_branch_range((long)cgctx.idx * 4)) {
cgctx.idx = 0;
- if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0)) {
+ if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) {
fp = org_fp;
goto out_addrs;
}
@@ -234,29 +166,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
skip_init_ctx:
code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
- if (extra_pass) {
- /*
- * Do not touch the prologue and epilogue as they will remain
- * unchanged. Only fix the branch target address for subprog
- * calls in the body, and ldimm64 instructions.
- *
- * This does not change the offsets and lengths of the subprog
- * call instruction sequences and hence, the size of the JITed
- * image as well.
- */
- bpf_jit_fixup_addresses(fp, code_base, &cgctx, addrs);
-
- /* There is no need to perform the usual passes. */
- goto skip_codegen_passes;
- }
-
/* Code generation passes 1-2 */
for (pass = 1; pass < 3; pass++) {
/* Now build the prologue, body code & epilogue for real. */
cgctx.idx = 0;
cgctx.alt_exit_addr = 0;
bpf_jit_build_prologue(code_base, &cgctx);
- if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass)) {
+ if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass, extra_pass)) {
bpf_jit_binary_free(bpf_hdr);
fp = org_fp;
goto out_addrs;
@@ -268,7 +184,6 @@ skip_init_ctx:
proglen - (cgctx.idx * 4), cgctx.seen);
}
-skip_codegen_passes:
if (bpf_jit_enable > 1)
/*
* Note that we output the base address of the code_base
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index a379b0ce19ff..7f91ea064c08 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -79,6 +79,20 @@ static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
#define SEEN_NVREG_FULL_MASK 0x0003ffff /* Non volatile registers r14-r31 */
#define SEEN_NVREG_TEMP_MASK 0x00001e01 /* BPF_REG_5, BPF_REG_AX, TMP_REG */
+static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
+{
+ /*
+ * We only need a stack frame if:
+ * - we call other functions (kernel helpers), or
+ * - we use non volatile registers, or
+ * - we use tail call counter
+ * - the bpf program uses its stack area
+ * The latter condition is deduced from the usage of BPF_REG_FP
+ */
+ return ctx->seen & (SEEN_FUNC | SEEN_TAILCALL | SEEN_NVREG_FULL_MASK) ||
+ bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP));
+}
+
void bpf_jit_realloc_regs(struct codegen_context *ctx)
{
unsigned int nvreg_mask;
@@ -114,11 +128,15 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
int i;
/* Initialize tail_call_cnt, to be skipped if we do tail calls. */
- EMIT(PPC_RAW_LI(_R4, 0));
+ if (ctx->seen & SEEN_TAILCALL)
+ EMIT(PPC_RAW_LI(_R4, 0));
+ else
+ EMIT(PPC_RAW_NOP());
#define BPF_TAILCALL_PROLOGUE_SIZE 4
- EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx)));
+ if (bpf_has_stack_frame(ctx))
+ EMIT(PPC_RAW_STWU(_R1, _R1, -BPF_PPC_STACKFRAME(ctx)));
if (ctx->seen & SEEN_TAILCALL)
EMIT(PPC_RAW_STW(_R4, _R1, bpf_jit_stack_offsetof(ctx, BPF_PPC_TC)));
@@ -141,12 +159,6 @@ void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
if (bpf_is_seen_register(ctx, i))
EMIT(PPC_RAW_STW(i, _R1, bpf_jit_stack_offsetof(ctx, i)));
- /* If needed retrieve arguments 9 and 10, ie 5th 64 bits arg.*/
- if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_5))) {
- EMIT(PPC_RAW_LWZ(bpf_to_ppc(BPF_REG_5) - 1, _R1, BPF_PPC_STACKFRAME(ctx)) + 8);
- EMIT(PPC_RAW_LWZ(bpf_to_ppc(BPF_REG_5), _R1, BPF_PPC_STACKFRAME(ctx)) + 12);
- }
-
/* Setup frame pointer to point to the bpf stack area */
if (bpf_is_seen_register(ctx, bpf_to_ppc(BPF_REG_FP))) {
EMIT(PPC_RAW_LI(bpf_to_ppc(BPF_REG_FP) - 1, 0));
@@ -171,7 +183,8 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
EMIT(PPC_RAW_LWZ(_R0, _R1, BPF_PPC_STACKFRAME(ctx) + PPC_LR_STKOFF));
/* Tear down our stack frame */
- EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx)));
+ if (bpf_has_stack_frame(ctx))
+ EMIT(PPC_RAW_ADDI(_R1, _R1, BPF_PPC_STACKFRAME(ctx)));
if (ctx->seen & SEEN_FUNC)
EMIT(PPC_RAW_MTLR(_R0));
@@ -193,9 +206,6 @@ int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func
if (image && rel < 0x2000000 && rel >= -0x2000000) {
PPC_BL(func);
- EMIT(PPC_RAW_NOP());
- EMIT(PPC_RAW_NOP());
- EMIT(PPC_RAW_NOP());
} else {
/* Load function address into r0 */
EMIT(PPC_RAW_LIS(_R0, IMM_H(func)));
@@ -269,7 +279,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o
/* Assemble the body code between the prologue & epilogue */
int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx,
- u32 *addrs, int pass)
+ u32 *addrs, int pass, bool extra_pass)
{
const struct bpf_insn *insn = fp->insnsi;
int flen = fp->len;
@@ -280,10 +290,13 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
for (i = 0; i < flen; i++) {
u32 code = insn[i].code;
+ u32 prevcode = i ? insn[i - 1].code : 0;
u32 dst_reg = bpf_to_ppc(insn[i].dst_reg);
u32 dst_reg_h = dst_reg - 1;
u32 src_reg = bpf_to_ppc(insn[i].src_reg);
u32 src_reg_h = src_reg - 1;
+ u32 src2_reg = dst_reg;
+ u32 src2_reg_h = dst_reg_h;
u32 ax_reg = bpf_to_ppc(BPF_REG_AX);
u32 tmp_reg = bpf_to_ppc(TMP_REG);
u32 size = BPF_SIZE(code);
@@ -296,6 +309,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
u32 tmp_idx;
int j;
+ if (i && (BPF_CLASS(code) == BPF_ALU64 || BPF_CLASS(code) == BPF_ALU) &&
+ (BPF_CLASS(prevcode) == BPF_ALU64 || BPF_CLASS(prevcode) == BPF_ALU) &&
+ BPF_OP(prevcode) == BPF_MOV && BPF_SRC(prevcode) == BPF_X &&
+ insn[i - 1].dst_reg == insn[i].dst_reg && insn[i - 1].imm != 1) {
+ src2_reg = bpf_to_ppc(insn[i - 1].src_reg);
+ src2_reg_h = src2_reg - 1;
+ ctx->idx = addrs[i - 1] / 4;
+ }
+
/*
* addrs[] maps a BPF bytecode address into a real offset from
* the start of the body code.
@@ -328,95 +350,111 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
* Arithmetic operations: ADD/SUB/MUL/DIV/MOD/NEG
*/
case BPF_ALU | BPF_ADD | BPF_X: /* (u32) dst += (u32) src */
- EMIT(PPC_RAW_ADD(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_ADD(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_ADD | BPF_X: /* dst += src */
- EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_ADDE(dst_reg_h, dst_reg_h, src_reg_h));
+ EMIT(PPC_RAW_ADDC(dst_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_ADDE(dst_reg_h, src2_reg_h, src_reg_h));
break;
case BPF_ALU | BPF_SUB | BPF_X: /* (u32) dst -= (u32) src */
- EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SUB(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_SUB | BPF_X: /* dst -= src */
- EMIT(PPC_RAW_SUBFC(dst_reg, src_reg, dst_reg));
- EMIT(PPC_RAW_SUBFE(dst_reg_h, src_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_SUBFC(dst_reg, src_reg, src2_reg));
+ EMIT(PPC_RAW_SUBFE(dst_reg_h, src_reg_h, src2_reg_h));
break;
case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
imm = -imm;
fallthrough;
case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */
- if (IMM_HA(imm) & 0xffff)
- EMIT(PPC_RAW_ADDIS(dst_reg, dst_reg, IMM_HA(imm)));
+ if (!imm) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ } else if (IMM_HA(imm) & 0xffff) {
+ EMIT(PPC_RAW_ADDIS(dst_reg, src2_reg, IMM_HA(imm)));
+ src2_reg = dst_reg;
+ }
if (IMM_L(imm))
- EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm)));
+ EMIT(PPC_RAW_ADDI(dst_reg, src2_reg, IMM_L(imm)));
break;
case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
imm = -imm;
fallthrough;
case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */
- if (!imm)
+ if (!imm) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h));
break;
-
+ }
if (imm >= -32768 && imm < 32768) {
- EMIT(PPC_RAW_ADDIC(dst_reg, dst_reg, imm));
+ EMIT(PPC_RAW_ADDIC(dst_reg, src2_reg, imm));
} else {
PPC_LI32(_R0, imm);
- EMIT(PPC_RAW_ADDC(dst_reg, dst_reg, _R0));
+ EMIT(PPC_RAW_ADDC(dst_reg, src2_reg, _R0));
}
if (imm >= 0 || (BPF_OP(code) == BPF_SUB && imm == 0x80000000))
- EMIT(PPC_RAW_ADDZE(dst_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_ADDZE(dst_reg_h, src2_reg_h));
else
- EMIT(PPC_RAW_ADDME(dst_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_ADDME(dst_reg_h, src2_reg_h));
break;
case BPF_ALU64 | BPF_MUL | BPF_X: /* dst *= src */
bpf_set_seen_register(ctx, tmp_reg);
- EMIT(PPC_RAW_MULW(_R0, dst_reg, src_reg_h));
- EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, src_reg));
- EMIT(PPC_RAW_MULHWU(tmp_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_MULW(_R0, src2_reg, src_reg_h));
+ EMIT(PPC_RAW_MULW(dst_reg_h, src2_reg_h, src_reg));
+ EMIT(PPC_RAW_MULHWU(tmp_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_MULW(dst_reg, src2_reg, src_reg));
EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0));
EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, tmp_reg));
break;
case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */
- EMIT(PPC_RAW_MULW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_MULW(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU | BPF_MUL | BPF_K: /* (u32) dst *= (u32) imm */
- if (imm >= -32768 && imm < 32768) {
- EMIT(PPC_RAW_MULI(dst_reg, dst_reg, imm));
+ if (imm == 1) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ } else if (imm == -1) {
+ EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0));
+ } else if (is_power_of_2((u32)imm)) {
+ EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, ilog2(imm)));
+ } else if (imm >= -32768 && imm < 32768) {
+ EMIT(PPC_RAW_MULI(dst_reg, src2_reg, imm));
} else {
PPC_LI32(_R0, imm);
- EMIT(PPC_RAW_MULW(dst_reg, dst_reg, _R0));
+ EMIT(PPC_RAW_MULW(dst_reg, src2_reg, _R0));
}
break;
case BPF_ALU64 | BPF_MUL | BPF_K: /* dst *= imm */
if (!imm) {
PPC_LI32(dst_reg, 0);
PPC_LI32(dst_reg_h, 0);
- break;
- }
- if (imm == 1)
- break;
- if (imm == -1) {
- EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0));
- EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h));
- break;
+ } else if (imm == 1) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h));
+ } else if (imm == -1) {
+ EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0));
+ EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h));
+ } else if (imm > 0 && is_power_of_2(imm)) {
+ imm = ilog2(imm);
+ EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, imm, 0, 31 - imm));
+ EMIT(PPC_RAW_RLWIMI(dst_reg_h, dst_reg, imm, 32 - imm, 31));
+ EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, imm));
+ } else {
+ bpf_set_seen_register(ctx, tmp_reg);
+ PPC_LI32(tmp_reg, imm);
+ EMIT(PPC_RAW_MULW(dst_reg_h, src2_reg_h, tmp_reg));
+ if (imm < 0)
+ EMIT(PPC_RAW_SUB(dst_reg_h, dst_reg_h, src2_reg));
+ EMIT(PPC_RAW_MULHWU(_R0, src2_reg, tmp_reg));
+ EMIT(PPC_RAW_MULW(dst_reg, src2_reg, tmp_reg));
+ EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0));
}
- bpf_set_seen_register(ctx, tmp_reg);
- PPC_LI32(tmp_reg, imm);
- EMIT(PPC_RAW_MULW(dst_reg_h, dst_reg_h, tmp_reg));
- if (imm < 0)
- EMIT(PPC_RAW_SUB(dst_reg_h, dst_reg_h, dst_reg));
- EMIT(PPC_RAW_MULHWU(_R0, dst_reg, tmp_reg));
- EMIT(PPC_RAW_MULW(dst_reg, dst_reg, tmp_reg));
- EMIT(PPC_RAW_ADD(dst_reg_h, dst_reg_h, _R0));
break;
case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */
- EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */
- EMIT(PPC_RAW_DIVWU(_R0, dst_reg, src_reg));
+ EMIT(PPC_RAW_DIVWU(_R0, src2_reg, src_reg));
EMIT(PPC_RAW_MULW(_R0, src_reg, _R0));
- EMIT(PPC_RAW_SUB(dst_reg, dst_reg, _R0));
+ EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0));
break;
case BPF_ALU64 | BPF_DIV | BPF_X: /* dst /= src */
return -EOPNOTSUPP;
@@ -425,11 +463,14 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
case BPF_ALU | BPF_DIV | BPF_K: /* (u32) dst /= (u32) imm */
if (!imm)
return -EINVAL;
- if (imm == 1)
- break;
-
- PPC_LI32(_R0, imm);
- EMIT(PPC_RAW_DIVWU(dst_reg, dst_reg, _R0));
+ if (imm == 1) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ } else if (is_power_of_2((u32)imm)) {
+ EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, ilog2(imm)));
+ } else {
+ PPC_LI32(_R0, imm);
+ EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, _R0));
+ }
break;
case BPF_ALU | BPF_MOD | BPF_K: /* (u32) dst %= (u32) imm */
if (!imm)
@@ -438,16 +479,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
if (!is_power_of_2((u32)imm)) {
bpf_set_seen_register(ctx, tmp_reg);
PPC_LI32(tmp_reg, imm);
- EMIT(PPC_RAW_DIVWU(_R0, dst_reg, tmp_reg));
+ EMIT(PPC_RAW_DIVWU(_R0, src2_reg, tmp_reg));
EMIT(PPC_RAW_MULW(_R0, tmp_reg, _R0));
- EMIT(PPC_RAW_SUB(dst_reg, dst_reg, _R0));
- break;
- }
- if (imm == 1)
+ EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0));
+ } else if (imm == 1) {
EMIT(PPC_RAW_LI(dst_reg, 0));
- else
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2((u32)imm), 31));
-
+ } else {
+ imm = ilog2((u32)imm);
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - imm, 31));
+ }
break;
case BPF_ALU64 | BPF_MOD | BPF_K: /* dst %= imm */
if (!imm)
@@ -459,7 +499,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
if (imm == 1)
EMIT(PPC_RAW_LI(dst_reg, 0));
else
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2(imm), 31));
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - ilog2(imm), 31));
EMIT(PPC_RAW_LI(dst_reg_h, 0));
break;
case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */
@@ -469,34 +509,38 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
return -EOPNOTSUPP;
if (imm < 0) {
- EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0));
- EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0));
+ EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h));
imm = -imm;
+ src2_reg = dst_reg;
+ }
+ if (imm == 1) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h));
+ } else {
+ imm = ilog2(imm);
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31));
+ EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1));
+ EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, imm));
}
- if (imm == 1)
- break;
- imm = ilog2(imm);
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31));
- EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1));
- EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm));
break;
case BPF_ALU | BPF_NEG: /* (u32) dst = -dst */
- EMIT(PPC_RAW_NEG(dst_reg, dst_reg));
+ EMIT(PPC_RAW_NEG(dst_reg, src2_reg));
break;
case BPF_ALU64 | BPF_NEG: /* dst = -dst */
- EMIT(PPC_RAW_SUBFIC(dst_reg, dst_reg, 0));
- EMIT(PPC_RAW_SUBFZE(dst_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_SUBFIC(dst_reg, src2_reg, 0));
+ EMIT(PPC_RAW_SUBFZE(dst_reg_h, src2_reg_h));
break;
/*
* Logical operations: AND/OR/XOR/[A]LSH/[A]RSH
*/
case BPF_ALU64 | BPF_AND | BPF_X: /* dst = dst & src */
- EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_AND(dst_reg_h, dst_reg_h, src_reg_h));
+ EMIT(PPC_RAW_AND(dst_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_AND(dst_reg_h, src2_reg_h, src_reg_h));
break;
case BPF_ALU | BPF_AND | BPF_X: /* (u32) dst = dst & src */
- EMIT(PPC_RAW_AND(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_AND(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_AND | BPF_K: /* dst = dst & imm */
if (imm >= 0)
@@ -504,23 +548,23 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
fallthrough;
case BPF_ALU | BPF_AND | BPF_K: /* (u32) dst = dst & imm */
if (!IMM_H(imm)) {
- EMIT(PPC_RAW_ANDI(dst_reg, dst_reg, IMM_L(imm)));
+ EMIT(PPC_RAW_ANDI(dst_reg, src2_reg, IMM_L(imm)));
} else if (!IMM_L(imm)) {
- EMIT(PPC_RAW_ANDIS(dst_reg, dst_reg, IMM_H(imm)));
+ EMIT(PPC_RAW_ANDIS(dst_reg, src2_reg, IMM_H(imm)));
} else if (imm == (((1 << fls(imm)) - 1) ^ ((1 << (ffs(i) - 1)) - 1))) {
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0,
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0,
32 - fls(imm), 32 - ffs(imm)));
} else {
PPC_LI32(_R0, imm);
- EMIT(PPC_RAW_AND(dst_reg, dst_reg, _R0));
+ EMIT(PPC_RAW_AND(dst_reg, src2_reg, _R0));
}
break;
case BPF_ALU64 | BPF_OR | BPF_X: /* dst = dst | src */
- EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, src_reg_h));
+ EMIT(PPC_RAW_OR(dst_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_OR(dst_reg_h, src2_reg_h, src_reg_h));
break;
case BPF_ALU | BPF_OR | BPF_X: /* dst = (u32) dst | (u32) src */
- EMIT(PPC_RAW_OR(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_OR(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_OR | BPF_K:/* dst = dst | imm */
/* Sign-extended */
@@ -528,145 +572,154 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
EMIT(PPC_RAW_LI(dst_reg_h, -1));
fallthrough;
case BPF_ALU | BPF_OR | BPF_K:/* dst = (u32) dst | (u32) imm */
- if (IMM_L(imm))
- EMIT(PPC_RAW_ORI(dst_reg, dst_reg, IMM_L(imm)));
+ if (IMM_L(imm)) {
+ EMIT(PPC_RAW_ORI(dst_reg, src2_reg, IMM_L(imm)));
+ src2_reg = dst_reg;
+ }
if (IMM_H(imm))
- EMIT(PPC_RAW_ORIS(dst_reg, dst_reg, IMM_H(imm)));
+ EMIT(PPC_RAW_ORIS(dst_reg, src2_reg, IMM_H(imm)));
break;
case BPF_ALU64 | BPF_XOR | BPF_X: /* dst ^= src */
if (dst_reg == src_reg) {
EMIT(PPC_RAW_LI(dst_reg, 0));
EMIT(PPC_RAW_LI(dst_reg_h, 0));
} else {
- EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_XOR(dst_reg_h, dst_reg_h, src_reg_h));
+ EMIT(PPC_RAW_XOR(dst_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_XOR(dst_reg_h, src2_reg_h, src_reg_h));
}
break;
case BPF_ALU | BPF_XOR | BPF_X: /* (u32) dst ^= src */
if (dst_reg == src_reg)
EMIT(PPC_RAW_LI(dst_reg, 0));
else
- EMIT(PPC_RAW_XOR(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_XOR(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_XOR | BPF_K: /* dst ^= imm */
if (imm < 0)
- EMIT(PPC_RAW_NOR(dst_reg_h, dst_reg_h, dst_reg_h));
+ EMIT(PPC_RAW_NOR(dst_reg_h, src2_reg_h, src2_reg_h));
fallthrough;
case BPF_ALU | BPF_XOR | BPF_K: /* (u32) dst ^= (u32) imm */
- if (IMM_L(imm))
- EMIT(PPC_RAW_XORI(dst_reg, dst_reg, IMM_L(imm)));
+ if (IMM_L(imm)) {
+ EMIT(PPC_RAW_XORI(dst_reg, src2_reg, IMM_L(imm)));
+ src2_reg = dst_reg;
+ }
if (IMM_H(imm))
- EMIT(PPC_RAW_XORIS(dst_reg, dst_reg, IMM_H(imm)));
+ EMIT(PPC_RAW_XORIS(dst_reg, src2_reg, IMM_H(imm)));
break;
case BPF_ALU | BPF_LSH | BPF_X: /* (u32) dst <<= (u32) src */
- EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SLW(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_LSH | BPF_X: /* dst <<= src; */
bpf_set_seen_register(ctx, tmp_reg);
EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32));
- EMIT(PPC_RAW_SLW(dst_reg_h, dst_reg_h, src_reg));
+ EMIT(PPC_RAW_SLW(dst_reg_h, src2_reg_h, src_reg));
EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32));
- EMIT(PPC_RAW_SRW(_R0, dst_reg, _R0));
- EMIT(PPC_RAW_SLW(tmp_reg, dst_reg, tmp_reg));
+ EMIT(PPC_RAW_SRW(_R0, src2_reg, _R0));
+ EMIT(PPC_RAW_SLW(tmp_reg, src2_reg, tmp_reg));
EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, _R0));
- EMIT(PPC_RAW_SLW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SLW(dst_reg, src2_reg, src_reg));
EMIT(PPC_RAW_OR(dst_reg_h, dst_reg_h, tmp_reg));
break;
case BPF_ALU | BPF_LSH | BPF_K: /* (u32) dst <<= (u32) imm */
- if (!imm)
- break;
- EMIT(PPC_RAW_SLWI(dst_reg, dst_reg, imm));
+ if (imm)
+ EMIT(PPC_RAW_SLWI(dst_reg, src2_reg, imm));
+ else
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
break;
case BPF_ALU64 | BPF_LSH | BPF_K: /* dst <<= imm */
if (imm < 0)
return -EINVAL;
- if (!imm)
- break;
- if (imm < 32) {
- EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, imm, 0, 31 - imm));
- EMIT(PPC_RAW_RLWIMI(dst_reg_h, dst_reg, imm, 32 - imm, 31));
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, imm, 0, 31 - imm));
- break;
- }
- if (imm < 64)
- EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg, imm, 0, 31 - imm));
- else
+ if (!imm) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ } else if (imm < 32) {
+ EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, imm, 0, 31 - imm));
+ EMIT(PPC_RAW_RLWIMI(dst_reg_h, src2_reg, imm, 32 - imm, 31));
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, imm, 0, 31 - imm));
+ } else if (imm < 64) {
+ EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg, imm, 0, 31 - imm));
+ EMIT(PPC_RAW_LI(dst_reg, 0));
+ } else {
EMIT(PPC_RAW_LI(dst_reg_h, 0));
- EMIT(PPC_RAW_LI(dst_reg, 0));
+ EMIT(PPC_RAW_LI(dst_reg, 0));
+ }
break;
case BPF_ALU | BPF_RSH | BPF_X: /* (u32) dst >>= (u32) src */
- EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_RSH | BPF_X: /* dst >>= src */
bpf_set_seen_register(ctx, tmp_reg);
EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32));
- EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg));
EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32));
- EMIT(PPC_RAW_SLW(_R0, dst_reg_h, _R0));
+ EMIT(PPC_RAW_SLW(_R0, src2_reg_h, _R0));
EMIT(PPC_RAW_SRW(tmp_reg, dst_reg_h, tmp_reg));
EMIT(PPC_RAW_OR(dst_reg, dst_reg, _R0));
- EMIT(PPC_RAW_SRW(dst_reg_h, dst_reg_h, src_reg));
+ EMIT(PPC_RAW_SRW(dst_reg_h, src2_reg_h, src_reg));
EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg));
break;
case BPF_ALU | BPF_RSH | BPF_K: /* (u32) dst >>= (u32) imm */
- if (!imm)
- break;
- EMIT(PPC_RAW_SRWI(dst_reg, dst_reg, imm));
+ if (imm)
+ EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, imm));
+ else
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
break;
case BPF_ALU64 | BPF_RSH | BPF_K: /* dst >>= imm */
if (imm < 0)
return -EINVAL;
- if (!imm)
- break;
- if (imm < 32) {
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31));
- EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1));
- EMIT(PPC_RAW_RLWINM(dst_reg_h, dst_reg_h, 32 - imm, imm, 31));
- break;
- }
- if (imm < 64)
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg_h, 64 - imm, imm - 32, 31));
- else
+ if (!imm) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h));
+ } else if (imm < 32) {
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31));
+ EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1));
+ EMIT(PPC_RAW_RLWINM(dst_reg_h, src2_reg_h, 32 - imm, imm, 31));
+ } else if (imm < 64) {
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg_h, 64 - imm, imm - 32, 31));
+ EMIT(PPC_RAW_LI(dst_reg_h, 0));
+ } else {
EMIT(PPC_RAW_LI(dst_reg, 0));
- EMIT(PPC_RAW_LI(dst_reg_h, 0));
+ EMIT(PPC_RAW_LI(dst_reg_h, 0));
+ }
break;
case BPF_ALU | BPF_ARSH | BPF_X: /* (s32) dst >>= src */
- EMIT(PPC_RAW_SRAW(dst_reg, dst_reg, src_reg));
+ EMIT(PPC_RAW_SRAW(dst_reg, src2_reg, src_reg));
break;
case BPF_ALU64 | BPF_ARSH | BPF_X: /* (s64) dst >>= src */
bpf_set_seen_register(ctx, tmp_reg);
EMIT(PPC_RAW_SUBFIC(_R0, src_reg, 32));
- EMIT(PPC_RAW_SRW(dst_reg, dst_reg, src_reg));
- EMIT(PPC_RAW_SLW(_R0, dst_reg_h, _R0));
+ EMIT(PPC_RAW_SRW(dst_reg, src2_reg, src_reg));
+ EMIT(PPC_RAW_SLW(_R0, src2_reg_h, _R0));
EMIT(PPC_RAW_ADDI(tmp_reg, src_reg, 32));
EMIT(PPC_RAW_OR(dst_reg, dst_reg, _R0));
EMIT(PPC_RAW_RLWINM(_R0, tmp_reg, 0, 26, 26));
- EMIT(PPC_RAW_SRAW(tmp_reg, dst_reg_h, tmp_reg));
- EMIT(PPC_RAW_SRAW(dst_reg_h, dst_reg_h, src_reg));
+ EMIT(PPC_RAW_SRAW(tmp_reg, src2_reg_h, tmp_reg));
+ EMIT(PPC_RAW_SRAW(dst_reg_h, src2_reg_h, src_reg));
EMIT(PPC_RAW_SLW(tmp_reg, tmp_reg, _R0));
EMIT(PPC_RAW_OR(dst_reg, dst_reg, tmp_reg));
break;
case BPF_ALU | BPF_ARSH | BPF_K: /* (s32) dst >>= imm */
- if (!imm)
- break;
- EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg, imm));
+ if (imm)
+ EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg, imm));
+ else
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
break;
case BPF_ALU64 | BPF_ARSH | BPF_K: /* (s64) dst >>= imm */
if (imm < 0)
return -EINVAL;
- if (!imm)
- break;
- if (imm < 32) {
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 32 - imm, imm, 31));
- EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg_h, 32 - imm, 0, imm - 1));
- EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, imm));
- break;
+ if (!imm) {
+ EMIT(PPC_RAW_MR(dst_reg, src2_reg));
+ EMIT(PPC_RAW_MR(dst_reg_h, src2_reg_h));
+ } else if (imm < 32) {
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 32 - imm, imm, 31));
+ EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg_h, 32 - imm, 0, imm - 1));
+ EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, imm));
+ } else if (imm < 64) {
+ EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg_h, imm - 32));
+ EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, 31));
+ } else {
+ EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg_h, 31));
+ EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, 31));
}
- if (imm < 64)
- EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, imm - 32));
- else
- EMIT(PPC_RAW_SRAWI(dst_reg, dst_reg_h, 31));
- EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg_h, 31));
break;
/*
@@ -700,7 +753,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
switch (imm) {
case 16:
/* Copy 16 bits to upper part */
- EMIT(PPC_RAW_RLWIMI(dst_reg, dst_reg, 16, 0, 15));
+ EMIT(PPC_RAW_RLWIMI(dst_reg, src2_reg, 16, 0, 15));
/* Rotate 8 bits right & mask */
EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 24, 16, 31));
break;
@@ -710,23 +763,23 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
* 2 bytes are already in their final position
* -- byte 2 and 4 (of bytes 1, 2, 3 and 4)
*/
- EMIT(PPC_RAW_RLWINM(_R0, dst_reg, 8, 0, 31));
+ EMIT(PPC_RAW_RLWINM(_R0, src2_reg, 8, 0, 31));
/* Rotate 24 bits and insert byte 1 */
- EMIT(PPC_RAW_RLWIMI(_R0, dst_reg, 24, 0, 7));
+ EMIT(PPC_RAW_RLWIMI(_R0, src2_reg, 24, 0, 7));
/* Rotate 24 bits and insert byte 3 */
- EMIT(PPC_RAW_RLWIMI(_R0, dst_reg, 24, 16, 23));
+ EMIT(PPC_RAW_RLWIMI(_R0, src2_reg, 24, 16, 23));
EMIT(PPC_RAW_MR(dst_reg, _R0));
break;
case 64:
bpf_set_seen_register(ctx, tmp_reg);
- EMIT(PPC_RAW_RLWINM(tmp_reg, dst_reg, 8, 0, 31));
- EMIT(PPC_RAW_RLWINM(_R0, dst_reg_h, 8, 0, 31));
+ EMIT(PPC_RAW_RLWINM(tmp_reg, src2_reg, 8, 0, 31));
+ EMIT(PPC_RAW_RLWINM(_R0, src2_reg_h, 8, 0, 31));
/* Rotate 24 bits and insert byte 1 */
- EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 0, 7));
- EMIT(PPC_RAW_RLWIMI(_R0, dst_reg_h, 24, 0, 7));
+ EMIT(PPC_RAW_RLWIMI(tmp_reg, src2_reg, 24, 0, 7));
+ EMIT(PPC_RAW_RLWIMI(_R0, src2_reg_h, 24, 0, 7));
/* Rotate 24 bits and insert byte 3 */
- EMIT(PPC_RAW_RLWIMI(tmp_reg, dst_reg, 24, 16, 23));
- EMIT(PPC_RAW_RLWIMI(_R0, dst_reg_h, 24, 16, 23));
+ EMIT(PPC_RAW_RLWIMI(tmp_reg, src2_reg, 24, 16, 23));
+ EMIT(PPC_RAW_RLWIMI(_R0, src2_reg_h, 24, 16, 23));
EMIT(PPC_RAW_MR(dst_reg, _R0));
EMIT(PPC_RAW_MR(dst_reg_h, tmp_reg));
break;
@@ -736,7 +789,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
switch (imm) {
case 16:
/* zero-extend 16 bits into 32 bits */
- EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 16, 31));
+ EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 16, 31));
break;
case 32:
case 64:
@@ -960,8 +1013,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm);
PPC_LI32(dst_reg, (u32)insn[i].imm);
/* padding to allow full 4 instructions for later patching */
- for (j = ctx->idx - tmp_idx; j < 4; j++)
- EMIT(PPC_RAW_NOP());
+ if (!image)
+ for (j = ctx->idx - tmp_idx; j < 4; j++)
+ EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
@@ -989,7 +1043,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
case BPF_JMP | BPF_CALL:
ctx->seen |= SEEN_FUNC;
- ret = bpf_jit_get_func_addr(fp, &insn[i], false,
+ ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass,
&func_addr, &func_addr_fixed);
if (ret < 0)
return ret;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 29ee306d6302..8dd3cabaa83a 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -240,13 +240,14 @@ int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func
* load the callee's address, but this may optimize the number of
* instructions required based on the nature of the address.
*
- * Since we don't want the number of instructions emitted to change,
+ * Since we don't want the number of instructions emitted to increase,
* we pad the optimized PPC_LI64() call with NOPs to guarantee that
* we always have a five-instruction sequence, which is the maximum
* that PPC_LI64() can emit.
*/
- for (i = ctx->idx - ctx_idx; i < 5; i++)
- EMIT(PPC_RAW_NOP());
+ if (!image)
+ for (i = ctx->idx - ctx_idx; i < 5; i++)
+ EMIT(PPC_RAW_NOP());
EMIT(PPC_RAW_MTCTR(_R12));
EMIT(PPC_RAW_BCTRL());
@@ -343,7 +344,7 @@ asm (
/* Assemble the body code between the prologue & epilogue */
int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx,
- u32 *addrs, int pass)
+ u32 *addrs, int pass, bool extra_pass)
{
enum stf_barrier_type stf_barrier = stf_barrier_type_get();
const struct bpf_insn *insn = fp->insnsi;
@@ -938,8 +939,9 @@ emit_clear:
tmp_idx = ctx->idx;
PPC_LI64(dst_reg, imm64);
/* padding to allow full 5 instructions for later patching */
- for (j = ctx->idx - tmp_idx; j < 5; j++)
- EMIT(PPC_RAW_NOP());
+ if (!image)
+ for (j = ctx->idx - tmp_idx; j < 5; j++)
+ EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
@@ -967,7 +969,7 @@ emit_clear:
case BPF_JMP | BPF_CALL:
ctx->seen |= SEEN_FUNC;
- ret = bpf_jit_get_func_addr(fp, &insn[i], false,
+ ret = bpf_jit_get_func_addr(fp, &insn[i], extra_pass,
&func_addr, &func_addr_fixed);
if (ret < 0)
return ret;
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 33c23225fd54..317175791d23 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -18,6 +18,7 @@
#include <asm/firmware.h>
#include <asm/hvcall.h>
#include <asm/io.h>
+#include <asm/papr-sysparm.h>
#include <linux/byteorder/generic.h>
#include <asm/rtas.h>
@@ -66,8 +67,6 @@ static bool is_physical_domain(unsigned int domain)
* Refer PAPR+ document to get parameter token value as '43'.
*/
-#define PROCESSOR_MODULE_INFO 43
-
static u32 phys_sockets; /* Physical sockets */
static u32 phys_chipspersocket; /* Physical chips per socket*/
static u32 phys_coresperchip; /* Physical cores per chip */
@@ -79,9 +78,7 @@ static u32 phys_coresperchip; /* Physical cores per chip */
*/
void read_24x7_sys_info(void)
{
- int call_status, len, ntypes;
-
- spin_lock(&rtas_data_buf_lock);
+ struct papr_sysparm_buf *buf;
/*
* Making system parameter: chips and sockets and cores per chip
@@ -91,32 +88,22 @@ void read_24x7_sys_info(void)
phys_chipspersocket = 1;
phys_coresperchip = 1;
- call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
- NULL,
- PROCESSOR_MODULE_INFO,
- __pa(rtas_data_buf),
- RTAS_DATA_BUF_SIZE);
-
- if (call_status != 0) {
- pr_err("Error calling get-system-parameter %d\n",
- call_status);
- } else {
- len = be16_to_cpup((__be16 *)&rtas_data_buf[0]);
- if (len < 8)
- goto out;
-
- ntypes = be16_to_cpup((__be16 *)&rtas_data_buf[2]);
+ buf = papr_sysparm_buf_alloc();
+ if (!buf)
+ return;
- if (!ntypes)
- goto out;
+ if (!papr_sysparm_get(PAPR_SYSPARM_PROC_MODULE_INFO, buf)) {
+ int ntypes = be16_to_cpup((__be16 *)&buf->val[0]);
+ int len = be16_to_cpu(buf->len);
- phys_sockets = be16_to_cpup((__be16 *)&rtas_data_buf[4]);
- phys_chipspersocket = be16_to_cpup((__be16 *)&rtas_data_buf[6]);
- phys_coresperchip = be16_to_cpup((__be16 *)&rtas_data_buf[8]);
+ if (len >= 8 && ntypes != 0) {
+ phys_sockets = be16_to_cpup((__be16 *)&buf->val[2]);
+ phys_chipspersocket = be16_to_cpup((__be16 *)&buf->val[4]);
+ phys_coresperchip = be16_to_cpup((__be16 *)&buf->val[6]);
+ }
}
-out:
- spin_unlock(&rtas_data_buf_lock);
+ papr_sysparm_buf_free(buf);
}
/* Domains for which more than one result element are returned for each event. */
@@ -1727,7 +1714,8 @@ static int hv_24x7_init(void)
}
/* POWER8 only supports v1, while POWER9 only supports v2. */
- if (PVR_VER(pvr) == PVR_POWER8)
+ if (PVR_VER(pvr) == PVR_POWER8 || PVR_VER(pvr) == PVR_POWER8E ||
+ PVR_VER(pvr) == PVR_POWER8NVL)
interface_version = 1;
else {
interface_version = 2;
diff --git a/arch/powerpc/platforms/44x/fsp2.c b/arch/powerpc/platforms/44x/fsp2.c
index e2e4f6d8150d..56d91dbef577 100644
--- a/arch/powerpc/platforms/44x/fsp2.c
+++ b/arch/powerpc/platforms/44x/fsp2.c
@@ -205,7 +205,7 @@ static void __init node_irq_request(const char *compat, irq_handler_t errirq_han
for_each_compatible_node(np, NULL, compat) {
irq = irq_of_parse_and_map(np, 0);
- if (irq == NO_IRQ) {
+ if (!irq) {
pr_err("device tree node %pOFn is missing a interrupt",
np);
of_node_put(np);
diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c
index e0647720ed5e..61dfec74ff85 100644
--- a/arch/powerpc/platforms/52xx/efika.c
+++ b/arch/powerpc/platforms/52xx/efika.c
@@ -41,7 +41,7 @@ static int rtas_read_config(struct pci_bus *bus, unsigned int devfn, int offset,
int ret = -1;
int rval;
- rval = rtas_call(rtas_token("read-pci-config"), 2, 2, &ret, addr, len);
+ rval = rtas_call(rtas_function_token(RTAS_FN_READ_PCI_CONFIG), 2, 2, &ret, addr, len);
*val = ret;
return rval ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL;
}
@@ -55,7 +55,7 @@ static int rtas_write_config(struct pci_bus *bus, unsigned int devfn,
| (hose->global_number << 24);
int rval;
- rval = rtas_call(rtas_token("write-pci-config"), 3, 1, NULL,
+ rval = rtas_call(rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG), 3, 1, NULL,
addr, len, val);
return rval ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL;
}
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9563336e3348..046b571496b1 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -118,19 +118,18 @@ endchoice
choice
prompt "CPU selection"
- default GENERIC_CPU
help
This will create a kernel which is optimised for a particular CPU.
The resulting kernel may not run on other CPUs, so use this with care.
If unsure, select Generic.
-config GENERIC_CPU
+config POWERPC64_CPU
bool "Generic (POWER5 and PowerPC 970 and above)"
depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN
select PPC_64S_HASH_MMU
-config GENERIC_CPU
+config POWERPC64_CPU
bool "Generic (POWER8 and above)"
depends on PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN
select ARCH_HAS_FAST_MULTIPLIER
@@ -144,6 +143,7 @@ config POWERPC_CPU
config CELL_CPU
bool "Cell Broadband Engine"
depends on PPC_BOOK3S_64 && !CPU_LITTLE_ENDIAN
+ depends on !CC_IS_CLANG
select PPC_64S_HASH_MMU
config PPC_970_CPU
@@ -188,11 +188,13 @@ config E5500_CPU
config E6500_CPU
bool "Freescale e6500"
depends on PPC64 && PPC_E500
+ depends on !CC_IS_CLANG
select PPC_HAS_LBARX_LHARX
config 405_CPU
bool "40x family"
depends on 40x
+ depends on !CC_IS_CLANG
config 440_CPU
bool "440 (44x family)"
@@ -201,22 +203,27 @@ config 440_CPU
config 464_CPU
bool "464 (44x family)"
depends on 44x
+ depends on !CC_IS_CLANG
config 476_CPU
bool "476 (47x family)"
depends on PPC_47x
+ depends on !CC_IS_CLANG
config 860_CPU
bool "8xx family"
depends on PPC_8xx
+ depends on !CC_IS_CLANG
config E300C2_CPU
bool "e300c2 (832x)"
depends on PPC_BOOK3S_32
+ depends on !CC_IS_CLANG
config E300C3_CPU
bool "e300c3 (831x)"
depends on PPC_BOOK3S_32
+ depends on !CC_IS_CLANG
config G4_CPU
bool "G4 (74xx)"
@@ -233,13 +240,12 @@ config E500MC_CPU
config TOOLCHAIN_DEFAULT_CPU
bool "Rely on the toolchain's implicit default CPU"
- depends on PPC32
endchoice
config TARGET_CPU_BOOL
bool
- default !GENERIC_CPU && !TOOLCHAIN_DEFAULT_CPU
+ default !TOOLCHAIN_DEFAULT_CPU
config TARGET_CPU
string
@@ -251,6 +257,10 @@ config TARGET_CPU
default "power8" if POWER8_CPU
default "power9" if POWER9_CPU
default "power10" if POWER10_CPU
+ default "e5500" if E5500_CPU
+ default "e6500" if E6500_CPU
+ default "power4" if POWERPC64_CPU && !CPU_LITTLE_ENDIAN
+ default "power8" if POWERPC64_CPU && CPU_LITTLE_ENDIAN
default "405" if 405_CPU
default "440" if 440_CPU
default "464" if 464_CPU
diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c
index 8d934ea6270c..98db63b72d56 100644
--- a/arch/powerpc/platforms/cell/ras.c
+++ b/arch/powerpc/platforms/cell/ras.c
@@ -297,8 +297,8 @@ int cbe_sysreset_hack(void)
static int __init cbe_ptcal_init(void)
{
int ret;
- ptcal_start_tok = rtas_token("ibm,cbe-start-ptcal");
- ptcal_stop_tok = rtas_token("ibm,cbe-stop-ptcal");
+ ptcal_start_tok = rtas_function_token(RTAS_FN_IBM_CBE_START_PTCAL);
+ ptcal_stop_tok = rtas_function_token(RTAS_FN_IBM_CBE_STOP_PTCAL);
if (ptcal_start_tok == RTAS_UNKNOWN_SERVICE
|| ptcal_stop_tok == RTAS_UNKNOWN_SERVICE)
diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c
index 31ce00b52a32..30394c6f8894 100644
--- a/arch/powerpc/platforms/cell/smp.c
+++ b/arch/powerpc/platforms/cell/smp.c
@@ -81,7 +81,7 @@ static inline int smp_startup_cpu(unsigned int lcpu)
* If the RTAS start-cpu token does not exist then presume the
* cpu is already spinning.
*/
- start_cpu = rtas_token("start-cpu");
+ start_cpu = rtas_function_token(RTAS_FN_START_CPU);
if (start_cpu == RTAS_UNKNOWN_SERVICE)
return 1;
@@ -152,7 +152,7 @@ void __init smp_init_cell(void)
cpumask_clear_cpu(boot_cpuid, &of_spin_map);
/* Non-lpar has additional take/give timebase */
- if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
+ if (rtas_function_token(RTAS_FN_FREEZE_TIME_BASE) != RTAS_UNKNOWN_SERVICE) {
smp_ops->give_timebase = rtas_give_timebase;
smp_ops->take_timebase = rtas_take_timebase;
}
diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c
index dab78076fedb..0eedae96498c 100644
--- a/arch/powerpc/platforms/chrp/nvram.c
+++ b/arch/powerpc/platforms/chrp/nvram.c
@@ -31,7 +31,7 @@ static unsigned char chrp_nvram_read_val(int addr)
return 0xff;
}
spin_lock_irqsave(&nvram_lock, flags);
- if ((rtas_call(rtas_token("nvram-fetch"), 3, 2, &done, addr,
+ if ((rtas_call(rtas_function_token(RTAS_FN_NVRAM_FETCH), 3, 2, &done, addr,
__pa(nvram_buf), 1) != 0) || 1 != done)
ret = 0xff;
else
@@ -53,7 +53,7 @@ static void chrp_nvram_write_val(int addr, unsigned char val)
}
spin_lock_irqsave(&nvram_lock, flags);
nvram_buf[0] = val;
- if ((rtas_call(rtas_token("nvram-store"), 3, 2, &done, addr,
+ if ((rtas_call(rtas_function_token(RTAS_FN_NVRAM_STORE), 3, 2, &done, addr,
__pa(nvram_buf), 1) != 0) || 1 != done)
printk(KERN_DEBUG "rtas IO error storing 0x%02x at %d", val, addr);
spin_unlock_irqrestore(&nvram_lock, flags);
diff --git a/arch/powerpc/platforms/chrp/pci.c b/arch/powerpc/platforms/chrp/pci.c
index 6f6598e771ff..428fd2a7b3ee 100644
--- a/arch/powerpc/platforms/chrp/pci.c
+++ b/arch/powerpc/platforms/chrp/pci.c
@@ -104,7 +104,7 @@ static int rtas_read_config(struct pci_bus *bus, unsigned int devfn, int offset,
int ret = -1;
int rval;
- rval = rtas_call(rtas_token("read-pci-config"), 2, 2, &ret, addr, len);
+ rval = rtas_call(rtas_function_token(RTAS_FN_READ_PCI_CONFIG), 2, 2, &ret, addr, len);
*val = ret;
return rval? PCIBIOS_DEVICE_NOT_FOUND: PCIBIOS_SUCCESSFUL;
}
@@ -118,7 +118,7 @@ static int rtas_write_config(struct pci_bus *bus, unsigned int devfn, int offset
| (hose->global_number << 24);
int rval;
- rval = rtas_call(rtas_token("write-pci-config"), 3, 1, NULL,
+ rval = rtas_call(rtas_function_token(RTAS_FN_WRITE_PCI_CONFIG), 3, 1, NULL,
addr, len, val);
return rval? PCIBIOS_DEVICE_NOT_FOUND: PCIBIOS_SUCCESSFUL;
}
diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c
index ec63c0558db6..d9049ceb1046 100644
--- a/arch/powerpc/platforms/chrp/setup.c
+++ b/arch/powerpc/platforms/chrp/setup.c
@@ -323,11 +323,11 @@ static void __init chrp_setup_arch(void)
printk("chrp type = %x [%s]\n", _chrp_type, chrp_names[_chrp_type]);
rtas_initialize();
- if (rtas_token("display-character") >= 0)
+ if (rtas_function_token(RTAS_FN_DISPLAY_CHARACTER) >= 0)
ppc_md.progress = rtas_progress;
/* use RTAS time-of-day routines if available */
- if (rtas_token("get-time-of-day") != RTAS_UNKNOWN_SERVICE) {
+ if (rtas_function_token(RTAS_FN_GET_TIME_OF_DAY) != RTAS_UNKNOWN_SERVICE) {
ppc_md.get_boot_time = rtas_get_boot_time;
ppc_md.get_rtc_time = rtas_get_rtc_time;
ppc_md.set_rtc_time = rtas_set_rtc_time;
diff --git a/arch/powerpc/platforms/maple/setup.c b/arch/powerpc/platforms/maple/setup.c
index c26c379e1cc8..98c8e3603064 100644
--- a/arch/powerpc/platforms/maple/setup.c
+++ b/arch/powerpc/platforms/maple/setup.c
@@ -162,8 +162,8 @@ static struct smp_ops_t maple_smp_ops = {
static void __init maple_use_rtas_reboot_and_halt_if_present(void)
{
- if (rtas_service_present("system-reboot") &&
- rtas_service_present("power-off")) {
+ if (rtas_function_implemented(RTAS_FN_SYSTEM_REBOOT) &&
+ rtas_function_implemented(RTAS_FN_POWER_OFF)) {
ppc_md.restart = rtas_restart;
pm_power_off = rtas_power_off;
ppc_md.halt = rtas_halt;
diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c
index 14133e120bdd..a8436bf35e2f 100644
--- a/arch/powerpc/platforms/powernv/opal-secvar.c
+++ b/arch/powerpc/platforms/powernv/opal-secvar.c
@@ -54,8 +54,7 @@ static int opal_status_to_err(int rc)
return err;
}
-static int opal_get_variable(const char *key, uint64_t ksize,
- u8 *data, uint64_t *dsize)
+static int opal_get_variable(const char *key, u64 ksize, u8 *data, u64 *dsize)
{
int rc;
@@ -71,8 +70,7 @@ static int opal_get_variable(const char *key, uint64_t ksize,
return opal_status_to_err(rc);
}
-static int opal_get_next_variable(const char *key, uint64_t *keylen,
- uint64_t keybufsize)
+static int opal_get_next_variable(const char *key, u64 *keylen, u64 keybufsize)
{
int rc;
@@ -88,8 +86,7 @@ static int opal_get_next_variable(const char *key, uint64_t *keylen,
return opal_status_to_err(rc);
}
-static int opal_set_variable(const char *key, uint64_t ksize, u8 *data,
- uint64_t dsize)
+static int opal_set_variable(const char *key, u64 ksize, u8 *data, u64 dsize)
{
int rc;
@@ -101,10 +98,57 @@ static int opal_set_variable(const char *key, uint64_t ksize, u8 *data,
return opal_status_to_err(rc);
}
+static ssize_t opal_secvar_format(char *buf, size_t bufsize)
+{
+ ssize_t rc = 0;
+ struct device_node *node;
+ const char *format;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_string(node, "format", &format);
+ if (rc)
+ goto out;
+
+ rc = snprintf(buf, bufsize, "%s", format);
+
+out:
+ of_node_put(node);
+
+ return rc;
+}
+
+static int opal_secvar_max_size(u64 *max_size)
+{
+ int rc;
+ struct device_node *node;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!node)
+ return -ENODEV;
+
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_u64(node, "max-var-size", max_size);
+
+out:
+ of_node_put(node);
+ return rc;
+}
+
static const struct secvar_operations opal_secvar_ops = {
.get = opal_get_variable,
.get_next = opal_get_next_variable,
.set = opal_set_variable,
+ .format = opal_secvar_format,
+ .max_size = opal_secvar_max_size,
};
static int opal_secvar_probe(struct platform_device *pdev)
@@ -116,9 +160,7 @@ static int opal_secvar_probe(struct platform_device *pdev)
return -ENODEV;
}
- set_secvar_ops(&opal_secvar_ops);
-
- return 0;
+ return set_secvar_ops(&opal_secvar_ops);
}
static const struct of_device_id opal_secvar_match[] = {
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 5c144c05cbfd..4f6e20a35aa1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2325,7 +2325,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
- if (!res || !res->flags || res->start > res->end)
+ if (!res || !res->flags || res->start > res->end ||
+ res->flags & IORESOURCE_UNSET)
return;
if (res->flags & IORESOURCE_IO) {
diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c
index c27e6cf85272..9de62bd52650 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -146,7 +146,7 @@ static long ps3_hpte_updatepp(unsigned long slot, unsigned long newpp,
static void ps3_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
int psize, int ssize)
{
- panic("ps3_hpte_updateboltedpp() not implemented");
+ pr_info("ps3_hpte_updateboltedpp() not implemented");
}
static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn,
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index a3b4d99567cb..b481c5c8bae1 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -151,16 +151,16 @@ config IBMEBUS
config PSERIES_PLPKS
depends on PPC_PSERIES
- bool "Support for the Platform Key Storage"
- help
- PowerVM provides an isolated Platform Keystore(PKS) storage
- allocation for each LPAR with individually managed access
- controls to store sensitive information securely. It can be
- used to store asymmetric public keys or secrets as required
- by different usecases. Select this config to enable
- operating system interface to hypervisor to access this space.
-
- If unsure, select N.
+ select NLS
+ bool
+ # PowerVM provides an isolated Platform Keystore (PKS) storage
+ # allocation for each LPAR with individually managed access
+ # controls to store sensitive information securely. It can be
+ # used to store asymmetric public keys or secrets as required
+ # by different usecases.
+ #
+ # This option is selected by in-kernel consumers that require
+ # access to the PKS.
config PAPR_SCM
depends on PPC_PSERIES && MEMORY_HOTPLUG && LIBNVDIMM
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 92310202bdd7..53c3b91af2f7 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -3,7 +3,7 @@ ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC)
ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG
obj-y := lpar.o hvCall.o nvram.o reconfig.o \
- of_helpers.o \
+ of_helpers.o rtas-work-area.o papr-sysparm.o \
setup.o iommu.o event_sources.o ras.o \
firmware.o power.o dlpar.o mobility.o rng.o \
pci.o pci_dlpar.o eeh_pseries.o msi.o \
@@ -27,8 +27,8 @@ obj-$(CONFIG_PAPR_SCM) += papr_scm.o
obj-$(CONFIG_PPC_SPLPAR) += vphn.o
obj-$(CONFIG_PPC_SVM) += svm.o
obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
-obj-$(CONFIG_PSERIES_PLPKS) += plpks.o
-
+obj-$(CONFIG_PSERIES_PLPKS) += plpks.o
+obj-$(CONFIG_PPC_SECURE_BOOT) += plpks-secvar.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PPC_VAS) += vas.o vas-sysfs.o
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 498d6efcb5ae..75ffdbcd2865 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -22,6 +22,7 @@
#include <asm/machdep.h>
#include <linux/uaccess.h>
#include <asm/rtas.h>
+#include <asm/rtas-work-area.h>
static struct workqueue_struct *pseries_hp_wq;
@@ -137,37 +138,27 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
struct property *property;
struct property *last_property = NULL;
struct cc_workarea *ccwa;
+ struct rtas_work_area *work_area;
char *data_buf;
int cc_token;
int rc = -1;
- cc_token = rtas_token("ibm,configure-connector");
+ cc_token = rtas_function_token(RTAS_FN_IBM_CONFIGURE_CONNECTOR);
if (cc_token == RTAS_UNKNOWN_SERVICE)
return NULL;
- data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
- if (!data_buf)
- return NULL;
+ work_area = rtas_work_area_alloc(SZ_4K);
+ data_buf = rtas_work_area_raw_buf(work_area);
ccwa = (struct cc_workarea *)&data_buf[0];
ccwa->drc_index = drc_index;
ccwa->zero = 0;
do {
- /* Since we release the rtas_data_buf lock between configure
- * connector calls we want to re-populate the rtas_data_buffer
- * with the contents of the previous call.
- */
- spin_lock(&rtas_data_buf_lock);
-
- memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE);
- rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL);
- memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
-
- spin_unlock(&rtas_data_buf_lock);
-
- if (rtas_busy_delay(rc))
- continue;
+ do {
+ rc = rtas_call(cc_token, 2, 1, NULL,
+ rtas_work_area_phys(work_area), NULL);
+ } while (rtas_busy_delay(rc));
switch (rc) {
case COMPLETE:
@@ -227,7 +218,7 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
} while (rc);
cc_error:
- kfree(data_buf);
+ rtas_work_area_free(work_area);
if (rc) {
if (first_dn)
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 6b507b62ce8f..def184da51cf 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -699,7 +699,7 @@ static int pseries_eeh_write_config(struct eeh_dev *edev, int where, int size, u
static int pseries_send_allow_unfreeze(struct pci_dn *pdn, u16 *vf_pe_array, int cur_vfs)
{
int rc;
- int ibm_allow_unfreeze = rtas_token("ibm,open-sriov-allow-unfreeze");
+ int ibm_allow_unfreeze = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE);
unsigned long buid, addr;
addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
@@ -774,7 +774,7 @@ static int pseries_notify_resume(struct eeh_dev *edev)
if (!edev)
return -EEXIST;
- if (rtas_token("ibm,open-sriov-allow-unfreeze") == RTAS_UNKNOWN_SERVICE)
+ if (rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_ALLOW_UNFREEZE) == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
if (edev->pdev->is_physfn || edev->pdev->is_virtfn)
@@ -815,14 +815,14 @@ static int __init eeh_pseries_init(void)
int ret, config_addr;
/* figure out EEH RTAS function call tokens */
- ibm_set_eeh_option = rtas_token("ibm,set-eeh-option");
- ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
- ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
- ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
- ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
- ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
- ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
- ibm_configure_pe = rtas_token("ibm,configure-pe");
+ ibm_set_eeh_option = rtas_function_token(RTAS_FN_IBM_SET_EEH_OPTION);
+ ibm_set_slot_reset = rtas_function_token(RTAS_FN_IBM_SET_SLOT_RESET);
+ ibm_read_slot_reset_state2 = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE2);
+ ibm_read_slot_reset_state = rtas_function_token(RTAS_FN_IBM_READ_SLOT_RESET_STATE);
+ ibm_slot_error_detail = rtas_function_token(RTAS_FN_IBM_SLOT_ERROR_DETAIL);
+ ibm_get_config_addr_info2 = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO2);
+ ibm_get_config_addr_info = rtas_function_token(RTAS_FN_IBM_GET_CONFIG_ADDR_INFO);
+ ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_PE);
/*
* ibm,configure-pe and ibm,configure-bridge have the same semantics,
@@ -830,7 +830,7 @@ static int __init eeh_pseries_init(void)
* ibm,configure-pe then fall back to using ibm,configure-bridge.
*/
if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE)
- ibm_configure_pe = rtas_token("ibm,configure-bridge");
+ ibm_configure_pe = rtas_function_token(RTAS_FN_IBM_CONFIGURE_BRIDGE);
/*
* Necessary sanity check. We needn't check "get-config-addr-info"
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index 090ae5a1e0f5..982e5e4b5e06 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -855,8 +855,8 @@ static int __init pseries_cpu_hotplug_init(void)
ppc_md.cpu_release = dlpar_cpu_release;
#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
- rtas_stop_self_token = rtas_token("stop-self");
- qcss_tok = rtas_token("query-cpu-stopped-state");
+ rtas_stop_self_token = rtas_function_token(RTAS_FN_STOP_SELF);
+ qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE);
if (rtas_stop_self_token == RTAS_UNKNOWN_SERVICE ||
qcss_tok == RTAS_UNKNOWN_SERVICE) {
diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c
index 7b74d4d34e9a..f411d4fe7b24 100644
--- a/arch/powerpc/platforms/pseries/io_event_irq.c
+++ b/arch/powerpc/platforms/pseries/io_event_irq.c
@@ -143,7 +143,7 @@ static int __init ioei_init(void)
{
struct device_node *np;
- ioei_check_exception_token = rtas_token("check-exception");
+ ioei_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION);
if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE)
return -ENODEV;
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 97ef6499e501..2eab323f6970 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -32,6 +32,7 @@
#include <asm/iommu.h>
#include <asm/tlb.h>
#include <asm/cputable.h>
+#include <asm/papr-sysparm.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/trace.h>
@@ -1469,8 +1470,6 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp,
}
}
-#define SPLPAR_TLB_BIC_TOKEN 50
-
/*
* The size of the TLB Block Invalidate Characteristics is variable. But at the
* maximum it will be the number of possible page sizes *2 + 10 bytes.
@@ -1481,42 +1480,24 @@ static inline void __init check_lp_set_hblkrm(unsigned int lp,
void __init pseries_lpar_read_hblkrm_characteristics(void)
{
- unsigned char local_buffer[SPLPAR_TLB_BIC_MAXLENGTH];
- int call_status, len, idx, bpsize;
+ static struct papr_sysparm_buf buf __initdata;
+ int len, idx, bpsize;
if (!firmware_has_feature(FW_FEATURE_BLOCK_REMOVE))
return;
- spin_lock(&rtas_data_buf_lock);
- memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
- call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
- NULL,
- SPLPAR_TLB_BIC_TOKEN,
- __pa(rtas_data_buf),
- RTAS_DATA_BUF_SIZE);
- memcpy(local_buffer, rtas_data_buf, SPLPAR_TLB_BIC_MAXLENGTH);
- local_buffer[SPLPAR_TLB_BIC_MAXLENGTH - 1] = '\0';
- spin_unlock(&rtas_data_buf_lock);
-
- if (call_status != 0) {
- pr_warn("%s %s Error calling get-system-parameter (0x%x)\n",
- __FILE__, __func__, call_status);
+ if (papr_sysparm_get(PAPR_SYSPARM_TLB_BLOCK_INVALIDATE_ATTRS, &buf))
return;
- }
- /*
- * The first two (2) bytes of the data in the buffer are the length of
- * the returned data, not counting these first two (2) bytes.
- */
- len = be16_to_cpu(*((u16 *)local_buffer)) + 2;
+ len = be16_to_cpu(buf.len);
if (len > SPLPAR_TLB_BIC_MAXLENGTH) {
pr_warn("%s too large returned buffer %d", __func__, len);
return;
}
- idx = 2;
+ idx = 0;
while (idx < len) {
- u8 block_shift = local_buffer[idx++];
+ u8 block_shift = buf.val[idx++];
u32 block_size;
unsigned int npsize;
@@ -1525,9 +1506,9 @@ void __init pseries_lpar_read_hblkrm_characteristics(void)
block_size = 1 << block_shift;
- for (npsize = local_buffer[idx++];
+ for (npsize = buf.val[idx++];
npsize > 0 && idx < len; npsize--)
- check_lp_set_hblkrm((unsigned int) local_buffer[idx++],
+ check_lp_set_hblkrm((unsigned int)buf.val[idx++],
block_size);
}
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 63fd925ccbb8..8acc70509520 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -19,6 +19,7 @@
#include <linux/errno.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
+#include <asm/papr-sysparm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
@@ -312,16 +313,6 @@ static void parse_mpp_x_data(struct seq_file *m)
}
/*
- * PAPR defines, in section "7.3.16 System Parameters Option", the token 55 to
- * read the LPAR name, and the largest output data to 4000 + 2 bytes length.
- */
-#define SPLPAR_LPAR_NAME_TOKEN 55
-#define GET_SYS_PARM_BUF_SIZE 4002
-#if GET_SYS_PARM_BUF_SIZE > RTAS_DATA_BUF_SIZE
-#error "GET_SYS_PARM_BUF_SIZE is larger than RTAS_DATA_BUF_SIZE"
-#endif
-
-/*
* Read the lpar name using the RTAS ibm,get-system-parameter call.
*
* The name read through this call is updated if changes are made by the end
@@ -332,46 +323,19 @@ static void parse_mpp_x_data(struct seq_file *m)
*/
static int read_rtas_lpar_name(struct seq_file *m)
{
- int rc, len, token;
- union {
- char raw_buffer[GET_SYS_PARM_BUF_SIZE];
- struct {
- __be16 len;
- char name[GET_SYS_PARM_BUF_SIZE-2];
- };
- } *local_buffer;
-
- token = rtas_token("ibm,get-system-parameter");
- if (token == RTAS_UNKNOWN_SERVICE)
- return -EINVAL;
+ struct papr_sysparm_buf *buf;
+ int err;
- local_buffer = kmalloc(sizeof(*local_buffer), GFP_KERNEL);
- if (!local_buffer)
+ buf = papr_sysparm_buf_alloc();
+ if (!buf)
return -ENOMEM;
- do {
- spin_lock(&rtas_data_buf_lock);
- memset(rtas_data_buf, 0, sizeof(*local_buffer));
- rc = rtas_call(token, 3, 1, NULL, SPLPAR_LPAR_NAME_TOKEN,
- __pa(rtas_data_buf), sizeof(*local_buffer));
- if (!rc)
- memcpy(local_buffer->raw_buffer, rtas_data_buf,
- sizeof(local_buffer->raw_buffer));
- spin_unlock(&rtas_data_buf_lock);
- } while (rtas_busy_delay(rc));
-
- if (!rc) {
- /* Force end of string */
- len = min((int) be16_to_cpu(local_buffer->len),
- (int) sizeof(local_buffer->name)-1);
- local_buffer->name[len] = '\0';
-
- seq_printf(m, "partition_name=%s\n", local_buffer->name);
- } else
- rc = -ENODATA;
+ err = papr_sysparm_get(PAPR_SYSPARM_LPAR_NAME, buf);
+ if (!err)
+ seq_printf(m, "partition_name=%s\n", buf->val);
- kfree(local_buffer);
- return rc;
+ papr_sysparm_buf_free(buf);
+ return err;
}
/*
@@ -397,7 +361,6 @@ static void read_lpar_name(struct seq_file *m)
pr_err_once("Error can't get the LPAR name");
}
-#define SPLPAR_CHARACTERISTICS_TOKEN 20
#define SPLPAR_MAXLENGTH 1026*(sizeof(char))
/*
@@ -408,45 +371,25 @@ static void read_lpar_name(struct seq_file *m)
*/
static void parse_system_parameter_string(struct seq_file *m)
{
- int call_status;
+ struct papr_sysparm_buf *buf;
- unsigned char *local_buffer = kmalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
- if (!local_buffer) {
- printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
- __FILE__, __func__, __LINE__);
+ buf = papr_sysparm_buf_alloc();
+ if (!buf)
return;
- }
- spin_lock(&rtas_data_buf_lock);
- memset(rtas_data_buf, 0, SPLPAR_MAXLENGTH);
- call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
- NULL,
- SPLPAR_CHARACTERISTICS_TOKEN,
- __pa(rtas_data_buf),
- RTAS_DATA_BUF_SIZE);
- memcpy(local_buffer, rtas_data_buf, SPLPAR_MAXLENGTH);
- local_buffer[SPLPAR_MAXLENGTH - 1] = '\0';
- spin_unlock(&rtas_data_buf_lock);
-
- if (call_status != 0) {
- printk(KERN_INFO
- "%s %s Error calling get-system-parameter (0x%x)\n",
- __FILE__, __func__, call_status);
+ if (papr_sysparm_get(PAPR_SYSPARM_SHARED_PROC_LPAR_ATTRS, buf)) {
+ goto out_free;
} else {
+ const char *local_buffer;
int splpar_strlen;
int idx, w_idx;
char *workbuffer = kzalloc(SPLPAR_MAXLENGTH, GFP_KERNEL);
- if (!workbuffer) {
- printk(KERN_ERR "%s %s kmalloc failure at line %d\n",
- __FILE__, __func__, __LINE__);
- kfree(local_buffer);
- return;
- }
-#ifdef LPARCFG_DEBUG
- printk(KERN_INFO "success calling get-system-parameter\n");
-#endif
- splpar_strlen = local_buffer[0] * 256 + local_buffer[1];
- local_buffer += 2; /* step over strlen value */
+
+ if (!workbuffer)
+ goto out_free;
+
+ splpar_strlen = be16_to_cpu(buf->len);
+ local_buffer = buf->val;
w_idx = 0;
idx = 0;
@@ -480,7 +423,8 @@ static void parse_system_parameter_string(struct seq_file *m)
kfree(workbuffer);
local_buffer -= 2; /* back up over strlen value */
}
- kfree(local_buffer);
+out_free:
+ papr_sysparm_buf_free(buf);
}
/* Return the number of processors in the system.
diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c
index 4cea71aa0f41..643d309d1bd0 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -195,7 +195,7 @@ static int update_dt_node(struct device_node *dn, s32 scope)
u32 nprops;
u32 vd;
- update_properties_token = rtas_token("ibm,update-properties");
+ update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES);
if (update_properties_token == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
@@ -306,7 +306,7 @@ static int pseries_devicetree_update(s32 scope)
int update_nodes_token;
int rc;
- update_nodes_token = rtas_token("ibm,update-nodes");
+ update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES);
if (update_nodes_token == RTAS_UNKNOWN_SERVICE)
return 0;
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index 3f05507e444d..423ee1d5bd94 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -679,8 +679,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
static int rtas_msi_init(void)
{
- query_token = rtas_token("ibm,query-interrupt-source-number");
- change_token = rtas_token("ibm,change-msi");
+ query_token = rtas_function_token(RTAS_FN_IBM_QUERY_INTERRUPT_SOURCE_NUMBER);
+ change_token = rtas_function_token(RTAS_FN_IBM_CHANGE_MSI);
if ((query_token == RTAS_UNKNOWN_SERVICE) ||
(change_token == RTAS_UNKNOWN_SERVICE)) {
diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c
index cbf1720eb4aa..8130c37962c0 100644
--- a/arch/powerpc/platforms/pseries/nvram.c
+++ b/arch/powerpc/platforms/pseries/nvram.c
@@ -227,8 +227,8 @@ int __init pSeries_nvram_init(void)
nvram_size = be32_to_cpup(nbytes_p);
- nvram_fetch = rtas_token("nvram-fetch");
- nvram_store = rtas_token("nvram-store");
+ nvram_fetch = rtas_function_token(RTAS_FN_NVRAM_FETCH);
+ nvram_store = rtas_function_token(RTAS_FN_NVRAM_STORE);
printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size);
of_node_put(nvram);
diff --git a/arch/powerpc/platforms/pseries/papr-sysparm.c b/arch/powerpc/platforms/pseries/papr-sysparm.c
new file mode 100644
index 000000000000..fedc61599e6c
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/papr-sysparm.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "papr-sysparm: " fmt
+
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <asm/rtas.h>
+#include <asm/papr-sysparm.h>
+#include <asm/rtas-work-area.h>
+
+struct papr_sysparm_buf *papr_sysparm_buf_alloc(void)
+{
+ struct papr_sysparm_buf *buf = kzalloc(sizeof(*buf), GFP_KERNEL);
+
+ return buf;
+}
+
+void papr_sysparm_buf_free(struct papr_sysparm_buf *buf)
+{
+ kfree(buf);
+}
+
+/**
+ * papr_sysparm_get() - Retrieve the value of a PAPR system parameter.
+ * @param: PAPR system parameter token as described in
+ * 7.3.16 "System Parameters Option".
+ * @buf: A &struct papr_sysparm_buf as returned from papr_sysparm_buf_alloc().
+ *
+ * Place the result of querying the specified parameter, if available,
+ * in @buf. The result includes a be16 length header followed by the
+ * value, which may be a string or binary data. See &struct papr_sysparm_buf.
+ *
+ * Since there is at least one parameter (60, OS Service Entitlement
+ * Status) where the results depend on the incoming contents of the
+ * work area, the caller-supplied buffer is copied unmodified into the
+ * work area before calling ibm,get-system-parameter.
+ *
+ * A defined parameter may not be implemented on a given system, and
+ * some implemented parameters may not be available to all partitions
+ * on a system. A parameter's disposition may change at any time due
+ * to system configuration changes or partition migration.
+ *
+ * Context: This function may sleep.
+ *
+ * Return: 0 on success, -errno otherwise. @buf is unmodified on error.
+ */
+
+int papr_sysparm_get(papr_sysparm_t param, struct papr_sysparm_buf *buf)
+{
+ const s32 token = rtas_function_token(RTAS_FN_IBM_GET_SYSTEM_PARAMETER);
+ struct rtas_work_area *work_area;
+ s32 fwrc;
+ int ret;
+
+ might_sleep();
+
+ if (WARN_ON(!buf))
+ return -EFAULT;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ work_area = rtas_work_area_alloc(sizeof(*buf));
+
+ memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf));
+
+ do {
+ fwrc = rtas_call(token, 3, 1, NULL, param.token,
+ rtas_work_area_phys(work_area),
+ rtas_work_area_size(work_area));
+ } while (rtas_busy_delay(fwrc));
+
+ switch (fwrc) {
+ case 0:
+ ret = 0;
+ memcpy(buf, rtas_work_area_raw_buf(work_area), sizeof(*buf));
+ break;
+ case -3: /* parameter not implemented */
+ ret = -EOPNOTSUPP;
+ break;
+ case -9002: /* this partition not authorized to retrieve this parameter */
+ ret = -EPERM;
+ break;
+ case -9999: /* "parameter error" e.g. the buffer is too small */
+ ret = -EINVAL;
+ break;
+ default:
+ pr_err("unexpected ibm,get-system-parameter result %d\n", fwrc);
+ fallthrough;
+ case -1: /* Hardware/platform error */
+ ret = -EIO;
+ break;
+ }
+
+ rtas_work_area_free(work_area);
+
+ return ret;
+}
+
+int papr_sysparm_set(papr_sysparm_t param, const struct papr_sysparm_buf *buf)
+{
+ const s32 token = rtas_function_token(RTAS_FN_IBM_SET_SYSTEM_PARAMETER);
+ struct rtas_work_area *work_area;
+ s32 fwrc;
+ int ret;
+
+ might_sleep();
+
+ if (WARN_ON(!buf))
+ return -EFAULT;
+
+ if (token == RTAS_UNKNOWN_SERVICE)
+ return -ENOENT;
+
+ work_area = rtas_work_area_alloc(sizeof(*buf));
+
+ memcpy(rtas_work_area_raw_buf(work_area), buf, sizeof(*buf));
+
+ do {
+ fwrc = rtas_call(token, 2, 1, NULL, param.token,
+ rtas_work_area_phys(work_area));
+ } while (rtas_busy_delay(fwrc));
+
+ switch (fwrc) {
+ case 0:
+ ret = 0;
+ break;
+ case -3: /* parameter not supported */
+ ret = -EOPNOTSUPP;
+ break;
+ case -9002: /* this partition not authorized to modify this parameter */
+ ret = -EPERM;
+ break;
+ case -9999: /* "parameter error" e.g. invalid input data */
+ ret = -EINVAL;
+ break;
+ default:
+ pr_err("unexpected ibm,set-system-parameter result %d\n", fwrc);
+ fallthrough;
+ case -1: /* Hardware/platform error */
+ ret = -EIO;
+ break;
+ }
+
+ rtas_work_area_free(work_area);
+
+ return ret;
+}
diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 6e671c3809ec..60e0a58928ef 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -60,7 +60,7 @@ static int pseries_send_map_pe(struct pci_dev *pdev, u16 num_vfs,
struct pci_dn *pdn;
int rc;
unsigned long buid, addr;
- int ibm_map_pes = rtas_token("ibm,open-sriov-map-pe-number");
+ int ibm_map_pes = rtas_function_token(RTAS_FN_IBM_OPEN_SRIOV_MAP_PE_NUMBER);
if (ibm_map_pes == RTAS_UNKNOWN_SERVICE)
return -EINVAL;
diff --git a/arch/powerpc/platforms/pseries/plpks-secvar.c b/arch/powerpc/platforms/pseries/plpks-secvar.c
new file mode 100644
index 000000000000..257fd1f8bc19
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/plpks-secvar.c
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+// Secure variable implementation using the PowerVM LPAR Platform KeyStore (PLPKS)
+//
+// Copyright 2022, 2023 IBM Corporation
+// Authors: Russell Currey
+// Andrew Donnellan
+// Nayna Jain
+
+#define pr_fmt(fmt) "secvar: "fmt
+
+#include <linux/printk.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/kobject.h>
+#include <linux/nls.h>
+#include <asm/machdep.h>
+#include <asm/secvar.h>
+#include <asm/plpks.h>
+
+// Config attributes for sysfs
+#define PLPKS_CONFIG_ATTR(name, fmt, func) \
+ static ssize_t name##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, \
+ char *buf) \
+ { \
+ return sysfs_emit(buf, fmt, func()); \
+ } \
+ static struct kobj_attribute attr_##name = __ATTR_RO(name)
+
+PLPKS_CONFIG_ATTR(version, "%u\n", plpks_get_version);
+PLPKS_CONFIG_ATTR(max_object_size, "%u\n", plpks_get_maxobjectsize);
+PLPKS_CONFIG_ATTR(total_size, "%u\n", plpks_get_totalsize);
+PLPKS_CONFIG_ATTR(used_space, "%u\n", plpks_get_usedspace);
+PLPKS_CONFIG_ATTR(supported_policies, "%08x\n", plpks_get_supportedpolicies);
+PLPKS_CONFIG_ATTR(signed_update_algorithms, "%016llx\n", plpks_get_signedupdatealgorithms);
+
+static const struct attribute *config_attrs[] = {
+ &attr_version.attr,
+ &attr_max_object_size.attr,
+ &attr_total_size.attr,
+ &attr_used_space.attr,
+ &attr_supported_policies.attr,
+ &attr_signed_update_algorithms.attr,
+ NULL,
+};
+
+static u32 get_policy(const char *name)
+{
+ if ((strcmp(name, "db") == 0) ||
+ (strcmp(name, "dbx") == 0) ||
+ (strcmp(name, "grubdb") == 0) ||
+ (strcmp(name, "grubdbx") == 0) ||
+ (strcmp(name, "sbat") == 0))
+ return (PLPKS_WORLDREADABLE | PLPKS_SIGNEDUPDATE);
+ else
+ return PLPKS_SIGNEDUPDATE;
+}
+
+static const char * const plpks_var_names[] = {
+ "PK",
+ "KEK",
+ "db",
+ "dbx",
+ "grubdb",
+ "grubdbx",
+ "sbat",
+ "moduledb",
+ "trustedcadb",
+ NULL,
+};
+
+static int plpks_get_variable(const char *key, u64 key_len, u8 *data,
+ u64 *data_size)
+{
+ struct plpks_var var = {0};
+ int rc = 0;
+
+ // We subtract 1 from key_len because we don't need to include the
+ // null terminator at the end of the string
+ var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL);
+ if (!var.name)
+ return -ENOMEM;
+ rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name,
+ key_len - 1);
+ if (rc < 0)
+ goto err;
+ var.namelen = rc * 2;
+
+ var.os = PLPKS_VAR_LINUX;
+ if (data) {
+ var.data = data;
+ var.datalen = *data_size;
+ }
+ rc = plpks_read_os_var(&var);
+
+ if (rc)
+ goto err;
+
+ *data_size = var.datalen;
+
+err:
+ kfree(var.name);
+ if (rc && rc != -ENOENT) {
+ pr_err("Failed to read variable '%s': %d\n", key, rc);
+ // Return -EIO since userspace probably doesn't care about the
+ // specific error
+ rc = -EIO;
+ }
+ return rc;
+}
+
+static int plpks_set_variable(const char *key, u64 key_len, u8 *data,
+ u64 data_size)
+{
+ struct plpks_var var = {0};
+ int rc = 0;
+ u64 flags;
+
+ // Secure variables need to be prefixed with 8 bytes of flags.
+ // We only want to perform the write if we have at least one byte of data.
+ if (data_size <= sizeof(flags))
+ return -EINVAL;
+
+ // We subtract 1 from key_len because we don't need to include the
+ // null terminator at the end of the string
+ var.name = kcalloc(key_len - 1, sizeof(wchar_t), GFP_KERNEL);
+ if (!var.name)
+ return -ENOMEM;
+ rc = utf8s_to_utf16s(key, key_len - 1, UTF16_LITTLE_ENDIAN, (wchar_t *)var.name,
+ key_len - 1);
+ if (rc < 0)
+ goto err;
+ var.namelen = rc * 2;
+
+ // Flags are contained in the first 8 bytes of the buffer, and are always big-endian
+ flags = be64_to_cpup((__be64 *)data);
+
+ var.datalen = data_size - sizeof(flags);
+ var.data = data + sizeof(flags);
+ var.os = PLPKS_VAR_LINUX;
+ var.policy = get_policy(key);
+
+ // Unlike in the read case, the plpks error code can be useful to
+ // userspace on write, so we return it rather than just -EIO
+ rc = plpks_signed_update_var(&var, flags);
+
+err:
+ kfree(var.name);
+ return rc;
+}
+
+// PLPKS dynamic secure boot doesn't give us a format string in the same way OPAL does.
+// Instead, report the format using the SB_VERSION variable in the keystore.
+// The string is made up by us, and takes the form "ibm,plpks-sb-v<n>" (or "ibm,plpks-sb-unknown"
+// if the SB_VERSION variable doesn't exist). Hypervisor defines the SB_VERSION variable as a
+// "1 byte unsigned integer value".
+static ssize_t plpks_secvar_format(char *buf, size_t bufsize)
+{
+ struct plpks_var var = {0};
+ ssize_t ret;
+ u8 version;
+
+ var.component = NULL;
+ // Only the signed variables have null bytes in their names, this one doesn't
+ var.name = "SB_VERSION";
+ var.namelen = strlen(var.name);
+ var.datalen = 1;
+ var.data = &version;
+
+ // Unlike the other vars, SB_VERSION is owned by firmware instead of the OS
+ ret = plpks_read_fw_var(&var);
+ if (ret) {
+ if (ret == -ENOENT) {
+ ret = snprintf(buf, bufsize, "ibm,plpks-sb-unknown");
+ } else {
+ pr_err("Error %ld reading SB_VERSION from firmware\n", ret);
+ ret = -EIO;
+ }
+ goto err;
+ }
+
+ ret = snprintf(buf, bufsize, "ibm,plpks-sb-v%hhu", version);
+err:
+ return ret;
+}
+
+static int plpks_max_size(u64 *max_size)
+{
+ // The max object size reported by the hypervisor is accurate for the
+ // object itself, but we use the first 8 bytes of data on write as the
+ // signed update flags, so the max size a user can write is larger.
+ *max_size = (u64)plpks_get_maxobjectsize() + sizeof(u64);
+
+ return 0;
+}
+
+
+static const struct secvar_operations plpks_secvar_ops = {
+ .get = plpks_get_variable,
+ .set = plpks_set_variable,
+ .format = plpks_secvar_format,
+ .max_size = plpks_max_size,
+ .config_attrs = config_attrs,
+ .var_names = plpks_var_names,
+};
+
+static int plpks_secvar_init(void)
+{
+ if (!plpks_is_available())
+ return -ENODEV;
+
+ return set_secvar_ops(&plpks_secvar_ops);
+}
+machine_device_initcall(pseries, plpks_secvar_init);
diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c
index 4edd1585e245..6f7bf3fc3aea 100644
--- a/arch/powerpc/platforms/pseries/plpks.c
+++ b/arch/powerpc/platforms/pseries/plpks.c
@@ -16,30 +16,28 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/memblock.h>
#include <asm/hvcall.h>
#include <asm/machdep.h>
-
-#include "plpks.h"
-
-#define PKS_FW_OWNER 0x1
-#define PKS_BOOTLOADER_OWNER 0x2
-#define PKS_OS_OWNER 0x3
-
-#define LABEL_VERSION 0
-#define MAX_LABEL_ATTR_SIZE 16
-#define MAX_NAME_SIZE 239
-#define MAX_DATA_SIZE 4000
-
-#define PKS_FLUSH_MAX_TIMEOUT 5000 //msec
-#define PKS_FLUSH_SLEEP 10 //msec
-#define PKS_FLUSH_SLEEP_RANGE 400
+#include <asm/plpks.h>
+#include <asm/firmware.h>
static u8 *ospassword;
static u16 ospasswordlength;
// Retrieved with H_PKS_GET_CONFIG
+static u8 version;
+static u16 objoverhead;
static u16 maxpwsize;
static u16 maxobjsize;
+static s16 maxobjlabelsize;
+static u32 totalsize;
+static u32 usedspace;
+static u32 supportedpolicies;
+static u32 maxlargeobjectsize;
+static u64 signedupdatealgorithms;
struct plpks_auth {
u8 version;
@@ -60,7 +58,7 @@ struct label_attr {
struct label {
struct label_attr attr;
- u8 name[MAX_NAME_SIZE];
+ u8 name[PLPKS_MAX_NAME_SIZE];
size_t size;
};
@@ -87,6 +85,12 @@ static int pseries_status_to_err(int rc)
err = -ENOENT;
break;
case H_BUSY:
+ case H_LONG_BUSY_ORDER_1_MSEC:
+ case H_LONG_BUSY_ORDER_10_MSEC:
+ case H_LONG_BUSY_ORDER_100_MSEC:
+ case H_LONG_BUSY_ORDER_1_SEC:
+ case H_LONG_BUSY_ORDER_10_SEC:
+ case H_LONG_BUSY_ORDER_100_SEC:
err = -EBUSY;
break;
case H_AUTHORITY:
@@ -117,16 +121,25 @@ static int pseries_status_to_err(int rc)
err = -EINVAL;
}
+ pr_debug("Converted hypervisor code %d to Linux %d\n", rc, err);
+
return err;
}
static int plpks_gen_password(void)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 };
- u8 *password, consumer = PKS_OS_OWNER;
+ u8 *password, consumer = PLPKS_OS_OWNER;
int rc;
- password = kzalloc(maxpwsize, GFP_KERNEL);
+ // If we booted from kexec, we could be reusing an existing password already
+ if (ospassword) {
+ pr_debug("Password of length %u already in use\n", ospasswordlength);
+ return 0;
+ }
+
+ // The password must not cross a page boundary, so we align to the next power of 2
+ password = kzalloc(roundup_pow_of_two(maxpwsize), GFP_KERNEL);
if (!password)
return -ENOMEM;
@@ -143,7 +156,7 @@ static int plpks_gen_password(void)
memcpy(ospassword, password, ospasswordlength);
} else {
if (rc == H_IN_USE) {
- pr_warn("Password is already set for POWER LPAR Platform KeyStore\n");
+ pr_warn("Password already set - authenticated operations will fail\n");
rc = 0;
} else {
goto out;
@@ -159,17 +172,19 @@ static struct plpks_auth *construct_auth(u8 consumer)
{
struct plpks_auth *auth;
- if (consumer > PKS_OS_OWNER)
+ if (consumer > PLPKS_OS_OWNER)
return ERR_PTR(-EINVAL);
- auth = kzalloc(struct_size(auth, password, maxpwsize), GFP_KERNEL);
+ // The auth structure must not cross a page boundary and must be
+ // 16 byte aligned. We align to the next largest power of 2
+ auth = kzalloc(roundup_pow_of_two(struct_size(auth, password, maxpwsize)), GFP_KERNEL);
if (!auth)
return ERR_PTR(-ENOMEM);
auth->version = 1;
auth->consumer = consumer;
- if (consumer == PKS_FW_OWNER || consumer == PKS_BOOTLOADER_OWNER)
+ if (consumer == PLPKS_FW_OWNER || consumer == PLPKS_BOOTLOADER_OWNER)
return auth;
memcpy(auth->password, ospassword, ospasswordlength);
@@ -187,25 +202,29 @@ static struct label *construct_label(char *component, u8 varos, u8 *name,
u16 namelen)
{
struct label *label;
- size_t slen;
+ size_t slen = 0;
- if (!name || namelen > MAX_NAME_SIZE)
+ if (!name || namelen > PLPKS_MAX_NAME_SIZE)
return ERR_PTR(-EINVAL);
- slen = strlen(component);
- if (component && slen > sizeof(label->attr.prefix))
- return ERR_PTR(-EINVAL);
+ // Support NULL component for signed updates
+ if (component) {
+ slen = strlen(component);
+ if (slen > sizeof(label->attr.prefix))
+ return ERR_PTR(-EINVAL);
+ }
- label = kzalloc(sizeof(*label), GFP_KERNEL);
+ // The label structure must not cross a page boundary, so we align to the next power of 2
+ label = kzalloc(roundup_pow_of_two(sizeof(*label)), GFP_KERNEL);
if (!label)
return ERR_PTR(-ENOMEM);
if (component)
memcpy(&label->attr.prefix, component, slen);
- label->attr.version = LABEL_VERSION;
+ label->attr.version = PLPKS_LABEL_VERSION;
label->attr.os = varos;
- label->attr.length = MAX_LABEL_ATTR_SIZE;
+ label->attr.length = PLPKS_MAX_LABEL_ATTR_SIZE;
memcpy(&label->name, name, namelen);
label->size = sizeof(struct label_attr) + namelen;
@@ -216,38 +235,164 @@ static struct label *construct_label(char *component, u8 varos, u8 *name,
static int _plpks_get_config(void)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 };
- struct {
+ struct config {
u8 version;
u8 flags;
- __be32 rsvd0;
+ __be16 rsvd0;
+ __be16 objoverhead;
__be16 maxpwsize;
__be16 maxobjlabelsize;
__be16 maxobjsize;
__be32 totalsize;
__be32 usedspace;
__be32 supportedpolicies;
- __be64 rsvd1;
- } __packed config;
+ __be32 maxlargeobjectsize;
+ __be64 signedupdatealgorithms;
+ u8 rsvd1[476];
+ } __packed * config;
size_t size;
- int rc;
+ int rc = 0;
+
+ size = sizeof(*config);
+
+ // Config struct must not cross a page boundary. So long as the struct
+ // size is a power of 2, this should be fine as alignment is guaranteed
+ config = kzalloc(size, GFP_KERNEL);
+ if (!config) {
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(config), size);
+
+ if (rc != H_SUCCESS) {
+ rc = pseries_status_to_err(rc);
+ goto err;
+ }
+
+ version = config->version;
+ objoverhead = be16_to_cpu(config->objoverhead);
+ maxpwsize = be16_to_cpu(config->maxpwsize);
+ maxobjsize = be16_to_cpu(config->maxobjsize);
+ maxobjlabelsize = be16_to_cpu(config->maxobjlabelsize);
+ totalsize = be32_to_cpu(config->totalsize);
+ usedspace = be32_to_cpu(config->usedspace);
+ supportedpolicies = be32_to_cpu(config->supportedpolicies);
+ maxlargeobjectsize = be32_to_cpu(config->maxlargeobjectsize);
+ signedupdatealgorithms = be64_to_cpu(config->signedupdatealgorithms);
+
+ // Validate that the numbers we get back match the requirements of the spec
+ if (maxpwsize < 32) {
+ pr_err("Invalid Max Password Size received from hypervisor (%d < 32)\n", maxpwsize);
+ rc = -EIO;
+ goto err;
+ }
+
+ if (maxobjlabelsize < 255) {
+ pr_err("Invalid Max Object Label Size received from hypervisor (%d < 255)\n",
+ maxobjlabelsize);
+ rc = -EIO;
+ goto err;
+ }
+
+ if (totalsize < 4096) {
+ pr_err("Invalid Total Size received from hypervisor (%d < 4096)\n", totalsize);
+ rc = -EIO;
+ goto err;
+ }
+
+ if (version >= 3 && maxlargeobjectsize >= 65536 && maxobjsize != 0xFFFF) {
+ pr_err("Invalid Max Object Size (0x%x != 0xFFFF)\n", maxobjsize);
+ rc = -EIO;
+ goto err;
+ }
+
+err:
+ kfree(config);
+ return rc;
+}
+
+u8 plpks_get_version(void)
+{
+ return version;
+}
+
+u16 plpks_get_objoverhead(void)
+{
+ return objoverhead;
+}
+
+u16 plpks_get_maxpwsize(void)
+{
+ return maxpwsize;
+}
+
+u16 plpks_get_maxobjectsize(void)
+{
+ return maxobjsize;
+}
+
+u16 plpks_get_maxobjectlabelsize(void)
+{
+ return maxobjlabelsize;
+}
+
+u32 plpks_get_totalsize(void)
+{
+ return totalsize;
+}
+
+u32 plpks_get_usedspace(void)
+{
+ // Unlike other config values, usedspace regularly changes as objects
+ // are updated, so we need to refresh.
+ int rc = _plpks_get_config();
+ if (rc) {
+ pr_err("Couldn't get config, rc: %d\n", rc);
+ return 0;
+ }
+ return usedspace;
+}
+
+u32 plpks_get_supportedpolicies(void)
+{
+ return supportedpolicies;
+}
+
+u32 plpks_get_maxlargeobjectsize(void)
+{
+ return maxlargeobjectsize;
+}
+
+u64 plpks_get_signedupdatealgorithms(void)
+{
+ return signedupdatealgorithms;
+}
- size = sizeof(config);
+u16 plpks_get_passwordlen(void)
+{
+ return ospasswordlength;
+}
- rc = plpar_hcall(H_PKS_GET_CONFIG, retbuf, virt_to_phys(&config), size);
+bool plpks_is_available(void)
+{
+ int rc;
- if (rc != H_SUCCESS)
- return pseries_status_to_err(rc);
+ if (!firmware_has_feature(FW_FEATURE_LPAR))
+ return false;
- maxpwsize = be16_to_cpu(config.maxpwsize);
- maxobjsize = be16_to_cpu(config.maxobjsize);
+ rc = _plpks_get_config();
+ if (rc)
+ return false;
- return 0;
+ return true;
}
static int plpks_confirm_object_flushed(struct label *label,
struct plpks_auth *auth)
{
unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = { 0 };
+ bool timed_out = true;
u64 timeout = 0;
u8 status;
int rc;
@@ -259,20 +404,79 @@ static int plpks_confirm_object_flushed(struct label *label,
status = retbuf[0];
if (rc) {
+ timed_out = false;
if (rc == H_NOT_FOUND && status == 1)
rc = 0;
break;
}
- if (!rc && status == 1)
+ if (!rc && status == 1) {
+ timed_out = false;
break;
+ }
- usleep_range(PKS_FLUSH_SLEEP,
- PKS_FLUSH_SLEEP + PKS_FLUSH_SLEEP_RANGE);
- timeout = timeout + PKS_FLUSH_SLEEP;
- } while (timeout < PKS_FLUSH_MAX_TIMEOUT);
+ usleep_range(PLPKS_FLUSH_SLEEP,
+ PLPKS_FLUSH_SLEEP + PLPKS_FLUSH_SLEEP_RANGE);
+ timeout = timeout + PLPKS_FLUSH_SLEEP;
+ } while (timeout < PLPKS_MAX_TIMEOUT);
- rc = pseries_status_to_err(rc);
+ if (timed_out)
+ return -ETIMEDOUT;
+
+ return pseries_status_to_err(rc);
+}
+
+int plpks_signed_update_var(struct plpks_var *var, u64 flags)
+{
+ unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
+ int rc;
+ struct label *label;
+ struct plpks_auth *auth;
+ u64 continuetoken = 0;
+ u64 timeout = 0;
+
+ if (!var->data || var->datalen <= 0 || var->namelen > PLPKS_MAX_NAME_SIZE)
+ return -EINVAL;
+
+ if (!(var->policy & PLPKS_SIGNEDUPDATE))
+ return -EINVAL;
+
+ // Signed updates need the component to be NULL.
+ if (var->component)
+ return -EINVAL;
+
+ auth = construct_auth(PLPKS_OS_OWNER);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ label = construct_label(var->component, var->os, var->name, var->namelen);
+ if (IS_ERR(label)) {
+ rc = PTR_ERR(label);
+ goto out;
+ }
+
+ do {
+ rc = plpar_hcall9(H_PKS_SIGNED_UPDATE, retbuf,
+ virt_to_phys(auth), virt_to_phys(label),
+ label->size, var->policy, flags,
+ virt_to_phys(var->data), var->datalen,
+ continuetoken);
+
+ continuetoken = retbuf[0];
+ if (pseries_status_to_err(rc) == -EBUSY) {
+ int delay_ms = get_longbusy_msecs(rc);
+ mdelay(delay_ms);
+ timeout += delay_ms;
+ }
+ rc = pseries_status_to_err(rc);
+ } while (rc == -EBUSY && timeout < PLPKS_MAX_TIMEOUT);
+
+ if (!rc)
+ rc = plpks_confirm_object_flushed(label, auth);
+
+ kfree(label);
+out:
+ kfree(auth);
return rc;
}
@@ -285,13 +489,13 @@ int plpks_write_var(struct plpks_var var)
int rc;
if (!var.component || !var.data || var.datalen <= 0 ||
- var.namelen > MAX_NAME_SIZE || var.datalen > MAX_DATA_SIZE)
+ var.namelen > PLPKS_MAX_NAME_SIZE || var.datalen > PLPKS_MAX_DATA_SIZE)
return -EINVAL;
- if (var.policy & SIGNEDUPDATE)
+ if (var.policy & PLPKS_SIGNEDUPDATE)
return -EINVAL;
- auth = construct_auth(PKS_OS_OWNER);
+ auth = construct_auth(PLPKS_OS_OWNER);
if (IS_ERR(auth))
return PTR_ERR(auth);
@@ -323,10 +527,10 @@ int plpks_remove_var(char *component, u8 varos, struct plpks_var_name vname)
struct label *label;
int rc;
- if (!component || vname.namelen > MAX_NAME_SIZE)
+ if (vname.namelen > PLPKS_MAX_NAME_SIZE)
return -EINVAL;
- auth = construct_auth(PKS_OS_OWNER);
+ auth = construct_auth(PLPKS_OS_OWNER);
if (IS_ERR(auth))
return PTR_ERR(auth);
@@ -358,14 +562,14 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var)
u8 *output;
int rc;
- if (var->namelen > MAX_NAME_SIZE)
+ if (var->namelen > PLPKS_MAX_NAME_SIZE)
return -EINVAL;
auth = construct_auth(consumer);
if (IS_ERR(auth))
return PTR_ERR(auth);
- if (consumer == PKS_OS_OWNER) {
+ if (consumer == PLPKS_OS_OWNER) {
label = construct_label(var->component, var->os, var->name,
var->namelen);
if (IS_ERR(label)) {
@@ -380,7 +584,7 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var)
goto out_free_label;
}
- if (consumer == PKS_OS_OWNER)
+ if (consumer == PLPKS_OS_OWNER)
rc = plpar_hcall(H_PKS_READ_OBJECT, retbuf, virt_to_phys(auth),
virt_to_phys(label), label->size, virt_to_phys(output),
maxobjsize);
@@ -395,17 +599,14 @@ static int plpks_read_var(u8 consumer, struct plpks_var *var)
goto out_free_output;
}
- if (var->datalen == 0 || var->datalen > retbuf[0])
+ if (!var->data || var->datalen > retbuf[0])
var->datalen = retbuf[0];
- var->data = kzalloc(var->datalen, GFP_KERNEL);
- if (!var->data) {
- rc = -ENOMEM;
- goto out_free_output;
- }
var->policy = retbuf[1];
- memcpy(var->data, output, var->datalen);
+ if (var->data)
+ memcpy(var->data, output, var->datalen);
+
rc = 0;
out_free_output:
@@ -420,17 +621,69 @@ out_free_auth:
int plpks_read_os_var(struct plpks_var *var)
{
- return plpks_read_var(PKS_OS_OWNER, var);
+ return plpks_read_var(PLPKS_OS_OWNER, var);
}
int plpks_read_fw_var(struct plpks_var *var)
{
- return plpks_read_var(PKS_FW_OWNER, var);
+ return plpks_read_var(PLPKS_FW_OWNER, var);
}
int plpks_read_bootloader_var(struct plpks_var *var)
{
- return plpks_read_var(PKS_BOOTLOADER_OWNER, var);
+ return plpks_read_var(PLPKS_BOOTLOADER_OWNER, var);
+}
+
+int plpks_populate_fdt(void *fdt)
+{
+ int chosen_offset = fdt_path_offset(fdt, "/chosen");
+
+ if (chosen_offset < 0) {
+ pr_err("Can't find chosen node: %s\n",
+ fdt_strerror(chosen_offset));
+ return chosen_offset;
+ }
+
+ return fdt_setprop(fdt, chosen_offset, "ibm,plpks-pw", ospassword, ospasswordlength);
+}
+
+// Once a password is registered with the hypervisor it cannot be cleared without
+// rebooting the LPAR, so to keep using the PLPKS across kexec boots we need to
+// recover the previous password from the FDT.
+//
+// There are a few challenges here. We don't want the password to be visible to
+// users, so we need to clear it from the FDT. This has to be done in early boot.
+// Clearing it from the FDT would make the FDT's checksum invalid, so we have to
+// manually cause the checksum to be recalculated.
+void __init plpks_early_init_devtree(void)
+{
+ void *fdt = initial_boot_params;
+ int chosen_node = fdt_path_offset(fdt, "/chosen");
+ const u8 *password;
+ int len;
+
+ if (chosen_node < 0)
+ return;
+
+ password = fdt_getprop(fdt, chosen_node, "ibm,plpks-pw", &len);
+ if (len <= 0) {
+ pr_debug("Couldn't find ibm,plpks-pw node.\n");
+ return;
+ }
+
+ ospassword = memblock_alloc_raw(len, SMP_CACHE_BYTES);
+ if (!ospassword) {
+ pr_err("Error allocating memory for password.\n");
+ goto out;
+ }
+
+ memcpy(ospassword, password, len);
+ ospasswordlength = (u16)len;
+
+out:
+ fdt_nop_property(fdt, chosen_node, "ibm,plpks-pw");
+ // Since we've cleared the password, we must update the FDT checksum
+ early_init_dt_verify(fdt);
}
static __init int pseries_plpks_init(void)
diff --git a/arch/powerpc/platforms/pseries/plpks.h b/arch/powerpc/platforms/pseries/plpks.h
deleted file mode 100644
index 275ccd86bfb5..000000000000
--- a/arch/powerpc/platforms/pseries/plpks.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2022 IBM Corporation
- * Author: Nayna Jain <nayna@linux.ibm.com>
- *
- * Platform keystore for pseries LPAR(PLPKS).
- */
-
-#ifndef _PSERIES_PLPKS_H
-#define _PSERIES_PLPKS_H
-
-#include <linux/types.h>
-#include <linux/list.h>
-
-#define OSSECBOOTAUDIT 0x40000000
-#define OSSECBOOTENFORCE 0x20000000
-#define WORLDREADABLE 0x08000000
-#define SIGNEDUPDATE 0x01000000
-
-#define PLPKS_VAR_LINUX 0x02
-#define PLPKS_VAR_COMMON 0x04
-
-struct plpks_var {
- char *component;
- u8 *name;
- u8 *data;
- u32 policy;
- u16 namelen;
- u16 datalen;
- u8 os;
-};
-
-struct plpks_var_name {
- u8 *name;
- u16 namelen;
-};
-
-struct plpks_var_name_list {
- u32 varcount;
- struct plpks_var_name varlist[];
-};
-
-/**
- * Writes the specified var and its data to PKS.
- * Any caller of PKS driver should present a valid component type for
- * their variable.
- */
-int plpks_write_var(struct plpks_var var);
-
-/**
- * Removes the specified var and its data from PKS.
- */
-int plpks_remove_var(char *component, u8 varos,
- struct plpks_var_name vname);
-
-/**
- * Returns the data for the specified os variable.
- */
-int plpks_read_os_var(struct plpks_var *var);
-
-/**
- * Returns the data for the specified firmware variable.
- */
-int plpks_read_fw_var(struct plpks_var *var);
-
-/**
- * Returns the data for the specified bootloader variable.
- */
-int plpks_read_bootloader_var(struct plpks_var *var);
-
-#endif
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index f12516c3998c..adafd593d9d3 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -155,7 +155,7 @@ static int __init init_ras_IRQ(void)
{
struct device_node *np;
- ras_check_exception_token = rtas_token("check-exception");
+ ras_check_exception_token = rtas_function_token(RTAS_FN_CHECK_EXCEPTION);
/* Internal Errors */
np = of_find_node_by_path("/event-sources/internal-errors");
diff --git a/arch/powerpc/platforms/pseries/rtas-work-area.c b/arch/powerpc/platforms/pseries/rtas-work-area.c
new file mode 100644
index 000000000000..b37d52f40360
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/rtas-work-area.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "rtas-work-area: " fmt
+
+#include <linux/genalloc.h>
+#include <linux/log2.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/mempool.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/numa.h>
+#include <linux/sizes.h>
+#include <linux/wait.h>
+
+#include <asm/machdep.h>
+#include <asm/rtas-work-area.h>
+#include <asm/rtas.h>
+
+enum {
+ /*
+ * Ensure the pool is page-aligned.
+ */
+ RTAS_WORK_AREA_ARENA_ALIGN = PAGE_SIZE,
+ /*
+ * Don't let a single allocation claim the whole arena.
+ */
+ RTAS_WORK_AREA_ARENA_SZ = RTAS_WORK_AREA_MAX_ALLOC_SZ * 2,
+ /*
+ * The smallest known work area size is for ibm,get-vpd's
+ * location code argument, which is limited to 79 characters
+ * plus 1 nul terminator.
+ *
+ * PAPR+ 7.3.20 ibm,get-vpd RTAS Call
+ * PAPR+ 12.3.2.4 Converged Location Code Rules - Length Restrictions
+ */
+ RTAS_WORK_AREA_MIN_ALLOC_SZ = roundup_pow_of_two(80),
+};
+
+static struct {
+ struct gen_pool *gen_pool;
+ char *arena;
+ struct mutex mutex; /* serializes allocations */
+ struct wait_queue_head wqh;
+ mempool_t descriptor_pool;
+ bool available;
+} rwa_state = {
+ .mutex = __MUTEX_INITIALIZER(rwa_state.mutex),
+ .wqh = __WAIT_QUEUE_HEAD_INITIALIZER(rwa_state.wqh),
+};
+
+/*
+ * A single work area buffer and descriptor to serve requests early in
+ * boot before the allocator is fully initialized. We know 4KB is the
+ * most any boot time user needs (they all call ibm,get-system-parameter).
+ */
+static bool early_work_area_in_use __initdata;
+static char early_work_area_buf[SZ_4K] __initdata __aligned(SZ_4K);
+static struct rtas_work_area early_work_area __initdata = {
+ .buf = early_work_area_buf,
+ .size = sizeof(early_work_area_buf),
+};
+
+
+static struct rtas_work_area * __init rtas_work_area_alloc_early(size_t size)
+{
+ WARN_ON(size > early_work_area.size);
+ WARN_ON(early_work_area_in_use);
+ early_work_area_in_use = true;
+ memset(early_work_area.buf, 0, early_work_area.size);
+ return &early_work_area;
+}
+
+static void __init rtas_work_area_free_early(struct rtas_work_area *work_area)
+{
+ WARN_ON(work_area != &early_work_area);
+ WARN_ON(!early_work_area_in_use);
+ early_work_area_in_use = false;
+}
+
+struct rtas_work_area * __ref __rtas_work_area_alloc(size_t size)
+{
+ struct rtas_work_area *area;
+ unsigned long addr;
+
+ might_sleep();
+
+ /*
+ * The rtas_work_area_alloc() wrapper enforces this at build
+ * time. Requests that exceed the arena size will block
+ * indefinitely.
+ */
+ WARN_ON(size > RTAS_WORK_AREA_MAX_ALLOC_SZ);
+
+ if (!rwa_state.available)
+ return rtas_work_area_alloc_early(size);
+ /*
+ * To ensure FCFS behavior and prevent a high rate of smaller
+ * requests from starving larger ones, use the mutex to queue
+ * allocations.
+ */
+ mutex_lock(&rwa_state.mutex);
+ wait_event(rwa_state.wqh,
+ (addr = gen_pool_alloc(rwa_state.gen_pool, size)) != 0);
+ mutex_unlock(&rwa_state.mutex);
+
+ area = mempool_alloc(&rwa_state.descriptor_pool, GFP_KERNEL);
+ area->buf = (char *)addr;
+ area->size = size;
+
+ return area;
+}
+
+void __ref rtas_work_area_free(struct rtas_work_area *area)
+{
+ if (!rwa_state.available) {
+ rtas_work_area_free_early(area);
+ return;
+ }
+
+ gen_pool_free(rwa_state.gen_pool, (unsigned long)area->buf, area->size);
+ mempool_free(area, &rwa_state.descriptor_pool);
+ wake_up(&rwa_state.wqh);
+}
+
+/*
+ * Initialization of the work area allocator happens in two parts. To
+ * reliably reserve an arena that satisfies RTAS addressing
+ * requirements, we must perform a memblock allocation early,
+ * immmediately after RTAS instantiation. Then we have to wait until
+ * the slab allocator is up before setting up the descriptor mempool
+ * and adding the arena to a gen_pool.
+ */
+static __init int rtas_work_area_allocator_init(void)
+{
+ const unsigned int order = ilog2(RTAS_WORK_AREA_MIN_ALLOC_SZ);
+ const phys_addr_t pa_start = __pa(rwa_state.arena);
+ const phys_addr_t pa_end = pa_start + RTAS_WORK_AREA_ARENA_SZ - 1;
+ struct gen_pool *pool;
+ const int nid = NUMA_NO_NODE;
+ int err;
+
+ err = -ENOMEM;
+ if (!rwa_state.arena)
+ goto err_out;
+
+ pool = gen_pool_create(order, nid);
+ if (!pool)
+ goto err_out;
+ /*
+ * All RTAS functions that consume work areas are OK with
+ * natural alignment, when they have alignment requirements at
+ * all.
+ */
+ gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
+
+ err = gen_pool_add(pool, (unsigned long)rwa_state.arena,
+ RTAS_WORK_AREA_ARENA_SZ, nid);
+ if (err)
+ goto err_destroy;
+
+ err = mempool_init_kmalloc_pool(&rwa_state.descriptor_pool, 1,
+ sizeof(struct rtas_work_area));
+ if (err)
+ goto err_destroy;
+
+ rwa_state.gen_pool = pool;
+ rwa_state.available = true;
+
+ pr_debug("arena [%pa-%pa] (%uK), min/max alloc sizes %u/%u\n",
+ &pa_start, &pa_end,
+ RTAS_WORK_AREA_ARENA_SZ / SZ_1K,
+ RTAS_WORK_AREA_MIN_ALLOC_SZ,
+ RTAS_WORK_AREA_MAX_ALLOC_SZ);
+
+ return 0;
+
+err_destroy:
+ gen_pool_destroy(pool);
+err_out:
+ return err;
+}
+machine_arch_initcall(pseries, rtas_work_area_allocator_init);
+
+/**
+ * rtas_work_area_reserve_arena() - Reserve memory suitable for RTAS work areas.
+ */
+void __init rtas_work_area_reserve_arena(const phys_addr_t limit)
+{
+ const phys_addr_t align = RTAS_WORK_AREA_ARENA_ALIGN;
+ const phys_addr_t size = RTAS_WORK_AREA_ARENA_SZ;
+ const phys_addr_t min = MEMBLOCK_LOW_LIMIT;
+ const int nid = NUMA_NO_NODE;
+
+ /*
+ * Too early for a machine_is(pseries) check. But PAPR
+ * effectively mandates that ibm,get-system-parameter is
+ * present:
+ *
+ * R1–7.3.16–1. All platforms must support the System
+ * Parameters option.
+ *
+ * So set up the arena if we find that, with a fallback to
+ * ibm,configure-connector, just in case.
+ */
+ if (rtas_function_implemented(RTAS_FN_IBM_GET_SYSTEM_PARAMETER) ||
+ rtas_function_implemented(RTAS_FN_IBM_CONFIGURE_CONNECTOR))
+ rwa_state.arena = memblock_alloc_try_nid(size, align, min, limit, nid);
+}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 8ef3270515a9..4a0cec8cf623 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -57,6 +57,7 @@
#include <asm/pmc.h>
#include <asm/xics.h>
#include <asm/xive.h>
+#include <asm/papr-sysparm.h>
#include <asm/ppc-pci.h>
#include <asm/i8259.h>
#include <asm/udbg.h>
@@ -135,11 +136,11 @@ static void __init fwnmi_init(void)
#endif
int ibm_nmi_register_token;
- ibm_nmi_register_token = rtas_token("ibm,nmi-register");
+ ibm_nmi_register_token = rtas_function_token(RTAS_FN_IBM_NMI_REGISTER);
if (ibm_nmi_register_token == RTAS_UNKNOWN_SERVICE)
return;
- ibm_nmi_interlock_token = rtas_token("ibm,nmi-interlock");
+ ibm_nmi_interlock_token = rtas_function_token(RTAS_FN_IBM_NMI_INTERLOCK);
if (WARN_ON(ibm_nmi_interlock_token == RTAS_UNKNOWN_SERVICE))
return;
@@ -941,28 +942,21 @@ void pSeries_coalesce_init(void)
*/
static void __init pSeries_cmo_feature_init(void)
{
+ static struct papr_sysparm_buf buf __initdata;
+ static_assert(sizeof(buf.val) >= CMO_MAXLENGTH);
char *ptr, *key, *value, *end;
- int call_status;
int page_order = IOMMU_PAGE_SHIFT_4K;
pr_debug(" -> fw_cmo_feature_init()\n");
- spin_lock(&rtas_data_buf_lock);
- memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE);
- call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
- NULL,
- CMO_CHARACTERISTICS_TOKEN,
- __pa(rtas_data_buf),
- RTAS_DATA_BUF_SIZE);
-
- if (call_status != 0) {
- spin_unlock(&rtas_data_buf_lock);
+
+ if (papr_sysparm_get(PAPR_SYSPARM_COOP_MEM_OVERCOMMIT_ATTRS, &buf)) {
pr_debug("CMO not available\n");
pr_debug(" <- fw_cmo_feature_init()\n");
return;
}
- end = rtas_data_buf + CMO_MAXLENGTH - 2;
- ptr = rtas_data_buf + 2; /* step over strlen value */
+ end = &buf.val[CMO_MAXLENGTH];
+ ptr = &buf.val[0];
key = value = ptr;
while (*ptr && (ptr <= end)) {
@@ -1008,7 +1002,6 @@ static void __init pSeries_cmo_feature_init(void)
} else
pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP,
CMO_SecPSP);
- spin_unlock(&rtas_data_buf_lock);
pr_debug(" <- fw_cmo_feature_init()\n");
}
@@ -1078,14 +1071,14 @@ static void __init pseries_init(void)
static void pseries_power_off(void)
{
int rc;
- int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups");
+ int rtas_poweroff_ups_token = rtas_function_token(RTAS_FN_IBM_POWER_OFF_UPS);
if (rtas_flash_term_hook)
rtas_flash_term_hook(SYS_POWER_OFF);
if (rtas_poweron_auto == 0 ||
rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) {
- rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1);
+ rc = rtas_call(rtas_function_token(RTAS_FN_POWER_OFF), 2, 1, NULL, -1, -1);
printk(KERN_INFO "RTAS power-off returned %d\n", rc);
} else {
rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL);
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index fd2174edfa1d..c597711ef20a 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -55,7 +55,7 @@ static cpumask_var_t of_spin_mask;
int smp_query_cpu_stopped(unsigned int pcpu)
{
int cpu_status, status;
- int qcss_tok = rtas_token("query-cpu-stopped-state");
+ int qcss_tok = rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE);
if (qcss_tok == RTAS_UNKNOWN_SERVICE) {
printk_once(KERN_INFO
@@ -108,7 +108,7 @@ static inline int smp_startup_cpu(unsigned int lcpu)
* If the RTAS start-cpu token does not exist then presume the
* cpu is already spinning.
*/
- start_cpu = rtas_token("start-cpu");
+ start_cpu = rtas_function_token(RTAS_FN_START_CPU);
if (start_cpu == RTAS_UNKNOWN_SERVICE)
return 1;
@@ -266,7 +266,7 @@ void __init smp_init_pseries(void)
* We know prom_init will not have started them if RTAS supports
* query-cpu-stopped-state.
*/
- if (rtas_token("query-cpu-stopped-state") == RTAS_UNKNOWN_SERVICE) {
+ if (rtas_function_token(RTAS_FN_QUERY_CPU_STOPPED_STATE) == RTAS_UNKNOWN_SERVICE) {
if (cpu_has_feature(CPU_FTR_SMT)) {
for_each_present_cpu(i) {
if (cpu_thread_in_core(i) == 0)
@@ -278,11 +278,5 @@ void __init smp_init_pseries(void)
cpumask_clear_cpu(boot_cpuid, of_spin_mask);
}
- /* Non-lpar has additional take/give timebase */
- if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) {
- smp_ops->give_timebase = rtas_give_timebase;
- smp_ops->take_timebase = rtas_take_timebase;
- }
-
pr_debug(" <- smp_init_pSeries()\n");
}
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
index a81d155b89ae..6f5e2727963c 100644
--- a/arch/powerpc/purgatory/Makefile
+++ b/arch/powerpc/purgatory/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
targets += trampoline_$(BITS).o purgatory.ro
diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c
index f8320f8e5bc7..b772a833d9b7 100644
--- a/arch/powerpc/sysdev/xics/ics-rtas.c
+++ b/arch/powerpc/sysdev/xics/ics-rtas.c
@@ -200,10 +200,10 @@ static struct ics ics_rtas = {
__init int ics_rtas_init(void)
{
- ibm_get_xive = rtas_token("ibm,get-xive");
- ibm_set_xive = rtas_token("ibm,set-xive");
- ibm_int_on = rtas_token("ibm,int-on");
- ibm_int_off = rtas_token("ibm,int-off");
+ ibm_get_xive = rtas_function_token(RTAS_FN_IBM_GET_XIVE);
+ ibm_set_xive = rtas_function_token(RTAS_FN_IBM_SET_XIVE);
+ ibm_int_on = rtas_function_token(RTAS_FN_IBM_INT_ON);
+ ibm_int_off = rtas_function_token(RTAS_FN_IBM_INT_OFF);
/* We enable the RTAS "ICS" if RTAS is present with the
* appropriate tokens
diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile
index eb25d7554ffd..d334de392e6c 100644
--- a/arch/powerpc/xmon/Makefile
+++ b/arch/powerpc/xmon/Makefile
@@ -5,6 +5,7 @@ GCOV_PROFILE := n
KCOV_INSTRUMENT := n
UBSAN_SANITIZE := n
KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
# Disable ftrace for the entire directory
ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 0da66bc4823d..73c620c2a3a1 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -76,9 +76,6 @@ static cpumask_t xmon_batch_cpus = CPU_MASK_NONE;
#define xmon_owner 0
#endif /* CONFIG_SMP */
-#ifdef CONFIG_PPC_PSERIES
-static int set_indicator_token = RTAS_UNKNOWN_SERVICE;
-#endif
static unsigned long in_xmon __read_mostly = 0;
static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
static bool xmon_is_ro = IS_ENABLED(CONFIG_XMON_DEFAULT_RO_MODE);
@@ -398,6 +395,7 @@ static inline void disable_surveillance(void)
#ifdef CONFIG_PPC_PSERIES
/* Since this can't be a module, args should end up below 4GB. */
static struct rtas_args args;
+ const s32 token = rtas_function_token(RTAS_FN_SET_INDICATOR);
/*
* At this point we have got all the cpus we can into
@@ -406,10 +404,10 @@ static inline void disable_surveillance(void)
* If we did try to take rtas.lock there would be a
* real possibility of deadlock.
*/
- if (set_indicator_token == RTAS_UNKNOWN_SERVICE)
+ if (token == RTAS_UNKNOWN_SERVICE)
return;
- rtas_call_unlocked(&args, set_indicator_token, 3, 1, NULL,
+ rtas_call_unlocked(&args, token, 3, 1, NULL,
SURVEILLANCE_TOKEN, 0, 0);
#endif /* CONFIG_PPC_PSERIES */
@@ -3976,14 +3974,6 @@ static void xmon_init(int enable)
__debugger_iabr_match = xmon_iabr_match;
__debugger_break_match = xmon_break_match;
__debugger_fault_handler = xmon_fault_handler;
-
-#ifdef CONFIG_PPC_PSERIES
- /*
- * Get the token here to avoid trying to get a lock
- * during the crash, causing a deadlock.
- */
- set_indicator_token = rtas_token("set-indicator");
-#endif
} else {
__debugger = NULL;
__debugger_ipi = NULL;
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 9c687da7756d..c5e42cc37604 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -14,10 +14,11 @@ config RISCV
def_bool y
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
+ select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
select ARCH_HAS_CURRENT_STACK_POINTER
- select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEBUG_VIRTUAL if MMU
+ select ARCH_HAS_DEBUG_VM_PGTABLE
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
@@ -44,12 +45,14 @@ config RISCV
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_GENERAL_HUGETLB
+ select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+ select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
- select CLONE_BACKWARDS
select CLINT_TIMER if !MMU
+ select CLONE_BACKWARDS
select COMMON_CLK
select CPU_PM if CPU_IDLE
select EDAC_SUPPORT
@@ -84,16 +87,16 @@ config RISCV
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
- select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
- select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
select HAVE_ASM_MODVERSIONS
select HAVE_CONTEXT_TRACKING_USER
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_EBPF_JIT if MMU
+ select HAVE_FUNCTION_ARG_ACCESS_API
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO if MMU && 64BIT
@@ -110,10 +113,9 @@ config RISCV
select HAVE_PERF_USER_STACK_DUMP
select HAVE_POSIX_CPU_TIMERS_TASK_WORK
select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_FUNCTION_ARG_ACCESS_API
+ select HAVE_RSEQ
select HAVE_STACKPROTECTOR
select HAVE_SYSCALL_TRACEPOINTS
- select HAVE_RSEQ
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA if MODULES
@@ -137,7 +139,7 @@ config RISCV
select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FUNCTION_TRACER if !XIP_KERNEL
+ select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
config ARCH_MMAP_RND_BITS_MIN
default 18 if 64BIT
@@ -234,9 +236,9 @@ config LOCKDEP_SUPPORT
config RISCV_DMA_NONCOHERENT
bool
select ARCH_HAS_DMA_PREP_COHERENT
- select ARCH_HAS_SYNC_DMA_FOR_DEVICE
- select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SETUP_DMA_OPS
+ select ARCH_HAS_SYNC_DMA_FOR_CPU
+ select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select DMA_DIRECT_REMAP
config AS_HAS_INSN
@@ -351,11 +353,11 @@ endchoice
config NUMA
bool "NUMA Memory Allocation and Scheduler Support"
depends on SMP && MMU
+ select ARCH_SUPPORTS_NUMA_BALANCING
select GENERIC_ARCH_NUMA
+ select NEED_PER_CPU_EMBED_FIRST_CHUNK
select OF_NUMA
- select ARCH_SUPPORTS_NUMA_BALANCING
select USE_PERCPU_NUMA_NODE_ID
- select NEED_PER_CPU_EMBED_FIRST_CHUNK
help
Enable NUMA (Non-Uniform Memory Access) support.
@@ -400,8 +402,8 @@ config RISCV_ISA_SVPBMT
bool "SVPBMT extension support"
depends on 64BIT && MMU
depends on !XIP_KERNEL
- select RISCV_ALTERNATIVE
default y
+ select RISCV_ALTERNATIVE
help
Adds support to dynamically detect the presence of the SVPBMT
ISA-extension (Supervisor-mode: page-based memory types) and
@@ -415,20 +417,36 @@ config RISCV_ISA_SVPBMT
If you don't know what to do here, say Y.
-config TOOLCHAIN_HAS_ZICBOM
+config TOOLCHAIN_HAS_ZBB
bool
default y
- depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zicbom)
- depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zicbom)
- depends on LLD_VERSION >= 150000 || LD_VERSION >= 23800
+ depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
+ depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
+ depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
+ depends on AS_IS_GNU
+
+config RISCV_ISA_ZBB
+ bool "Zbb extension support for bit manipulation instructions"
+ depends on TOOLCHAIN_HAS_ZBB
+ depends on !XIP_KERNEL && MMU
+ select RISCV_ALTERNATIVE
+ default y
+ help
+ Adds support to dynamically detect the presence of the ZBB
+ extension (basic bit manipulation) and enable its usage.
+
+ The Zbb extension provides instructions to accelerate a number
+ of bit-specific operations (count bit population, sign extending,
+ bitrotation, etc).
+
+ If you don't know what to do here, say Y.
config RISCV_ISA_ZICBOM
bool "Zicbom extension support for non-coherent DMA operation"
- depends on TOOLCHAIN_HAS_ZICBOM
depends on !XIP_KERNEL && MMU
- select RISCV_DMA_NONCOHERENT
- select RISCV_ALTERNATIVE
default y
+ select RISCV_ALTERNATIVE
+ select RISCV_DMA_NONCOHERENT
help
Adds support to dynamically detect the presence of the ZICBOM
extension (Cache Block Management Operations) and enable its
@@ -490,9 +508,9 @@ config RISCV_BOOT_SPINWAIT
config KEXEC
bool "Kexec system call"
- select KEXEC_CORE
- select HOTPLUG_CPU if SMP
depends on MMU
+ select HOTPLUG_CPU if SMP
+ select KEXEC_CORE
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -503,10 +521,10 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based systmem call"
+ depends on 64BIT && MMU
+ select HAVE_IMA_KEXEC if IMA
select KEXEC_CORE
select KEXEC_ELF
- select HAVE_IMA_KEXEC if IMA
- depends on 64BIT && MMU
help
This is new version of kexec system call. This system call is
file based and takes file descriptors as system call argument
@@ -595,15 +613,15 @@ config EFI_STUB
config EFI
bool "UEFI runtime support"
depends on OF && !XIP_KERNEL
- select LIBFDT
- select UCS2_STRING
- select EFI_PARAMS_FROM_FDT
- select EFI_STUB
+ depends on MMU
+ default y
select EFI_GENERIC_STUB
+ select EFI_PARAMS_FROM_FDT
select EFI_RUNTIME_WRAPPERS
+ select EFI_STUB
+ select LIBFDT
select RISCV_ISA_C
- depends on MMU
- default y
+ select UCS2_STRING
help
This option provides support for runtime services provided
by UEFI firmware (such as non-volatile variables, realtime
@@ -682,8 +700,8 @@ config PORTABLE
bool
default !NONPORTABLE
select EFI
- select OF
select MMU
+ select OF
menu "Power management options"
diff --git a/arch/riscv/Kconfig.socs b/arch/riscv/Kconfig.socs
index 4b91367604ea..1cf69f958f10 100644
--- a/arch/riscv/Kconfig.socs
+++ b/arch/riscv/Kconfig.socs
@@ -43,7 +43,7 @@ config ARCH_SUNXI
config ARCH_VIRT
def_bool SOC_VIRT
-
+
config SOC_VIRT
bool "QEMU Virt Machine"
select CLINT_TIMER if RISCV_M_MODE
@@ -88,7 +88,8 @@ config SOC_CANAAN_K210_DTB_BUILTIN
If unsure, say Y.
config ARCH_CANAAN_K210_DTB_SOURCE
- def_bool SOC_CANAAN_K210_DTB_SOURCE
+ string
+ default SOC_CANAAN_K210_DTB_SOURCE
config SOC_CANAAN_K210_DTB_SOURCE
string "Source file for the Canaan Kendryte K210 builtin DTB"
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 7123511d977c..6203c3378922 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -11,7 +11,11 @@ LDFLAGS_vmlinux :=
ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
LDFLAGS_vmlinux := --no-relax
KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
- CC_FLAGS_FTRACE := -fpatchable-function-entry=8
+ifeq ($(CONFIG_RISCV_ISA_C),y)
+ CC_FLAGS_FTRACE := -fpatchable-function-entry=4
+else
+ CC_FLAGS_FTRACE := -fpatchable-function-entry=2
+endif
endif
ifeq ($(CONFIG_CMODEL_MEDLOW),y)
@@ -58,9 +62,6 @@ riscv-march-$(CONFIG_RISCV_ISA_C) := $(riscv-march-y)c
toolchain-need-zicsr-zifencei := $(call cc-option-yn, -march=$(riscv-march-y)_zicsr_zifencei)
riscv-march-$(toolchain-need-zicsr-zifencei) := $(riscv-march-y)_zicsr_zifencei
-# Check if the toolchain supports Zicbom extension
-riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZICBOM) := $(riscv-march-y)_zicbom
-
# Check if the toolchain supports Zihintpause extension
riscv-march-$(CONFIG_TOOLCHAIN_HAS_ZIHINTPAUSE) := $(riscv-march-y)_zihintpause
diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c
index 1031038423e7..da55cb247e89 100644
--- a/arch/riscv/errata/sifive/errata.c
+++ b/arch/riscv/errata/sifive/errata.c
@@ -4,6 +4,7 @@
*/
#include <linux/kernel.h>
+#include <linux/memory.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/bug.h>
@@ -107,7 +108,10 @@ void __init_or_module sifive_errata_patch_func(struct alt_entry *begin,
tmp = (1U << alt->errata_id);
if (cpu_req_errata & tmp) {
- patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
+ mutex_lock(&text_mutex);
+ patch_text_nosync(ALT_OLD_PTR(alt), ALT_ALT_PTR(alt),
+ alt->alt_len);
+ mutex_lock(&text_mutex);
cpu_apply_errata |= tmp;
}
}
diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c
index fac5742d1c1e..3b96a06d3c54 100644
--- a/arch/riscv/errata/thead/errata.c
+++ b/arch/riscv/errata/thead/errata.c
@@ -5,6 +5,7 @@
#include <linux/bug.h>
#include <linux/kernel.h>
+#include <linux/memory.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -87,6 +88,7 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al
struct alt_entry *alt;
u32 cpu_req_errata = thead_errata_probe(stage, archid, impid);
u32 tmp;
+ void *oldptr, *altptr;
for (alt = begin; alt < end; alt++) {
if (alt->vendor_id != THEAD_VENDOR_ID)
@@ -96,12 +98,17 @@ void __init_or_module thead_errata_patch_func(struct alt_entry *begin, struct al
tmp = (1U << alt->errata_id);
if (cpu_req_errata & tmp) {
+ oldptr = ALT_OLD_PTR(alt);
+ altptr = ALT_ALT_PTR(alt);
+
/* On vm-alternatives, the mmu isn't running yet */
- if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
- memcpy((void *)__pa_symbol(alt->old_ptr),
- (void *)__pa_symbol(alt->alt_ptr), alt->alt_len);
- else
- patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
+ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) {
+ memcpy(oldptr, altptr, alt->alt_len);
+ } else {
+ mutex_lock(&text_mutex);
+ patch_text_nosync(oldptr, altptr, alt->alt_len);
+ mutex_unlock(&text_mutex);
+ }
}
}
diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h
index 2c0f4c887289..51c6867e02f3 100644
--- a/arch/riscv/include/asm/alternative-macros.h
+++ b/arch/riscv/include/asm/alternative-macros.h
@@ -7,11 +7,11 @@
#ifdef __ASSEMBLY__
.macro ALT_ENTRY oldptr newptr vendor_id errata_id new_len
- RISCV_PTR \oldptr
- RISCV_PTR \newptr
- REG_ASM \vendor_id
- REG_ASM \new_len
- .word \errata_id
+ .4byte \oldptr - .
+ .4byte \newptr - .
+ .2byte \vendor_id
+ .2byte \new_len
+ .4byte \errata_id
.endm
.macro ALT_NEW_CONTENT vendor_id, errata_id, enable = 1, new_c : vararg
@@ -59,11 +59,11 @@
#include <linux/stringify.h>
#define ALT_ENTRY(oldptr, newptr, vendor_id, errata_id, newlen) \
- RISCV_PTR " " oldptr "\n" \
- RISCV_PTR " " newptr "\n" \
- REG_ASM " " vendor_id "\n" \
- REG_ASM " " newlen "\n" \
- ".word " errata_id "\n"
+ ".4byte ((" oldptr ") - .) \n" \
+ ".4byte ((" newptr ") - .) \n" \
+ ".2byte " vendor_id "\n" \
+ ".2byte " newlen "\n" \
+ ".4byte " errata_id "\n"
#define ALT_NEW_CONTENT(vendor_id, errata_id, enable, new_c) \
".if " __stringify(enable) " == 1\n" \
diff --git a/arch/riscv/include/asm/alternative.h b/arch/riscv/include/asm/alternative.h
index 6511dd73e812..b8648d4f2ac1 100644
--- a/arch/riscv/include/asm/alternative.h
+++ b/arch/riscv/include/asm/alternative.h
@@ -23,17 +23,25 @@
#define RISCV_ALTERNATIVES_MODULE 1 /* alternatives applied during module-init */
#define RISCV_ALTERNATIVES_EARLY_BOOT 2 /* alternatives applied before mmu start */
+/* add the relative offset to the address of the offset to get the absolute address */
+#define __ALT_PTR(a, f) ((void *)&(a)->f + (a)->f)
+#define ALT_OLD_PTR(a) __ALT_PTR(a, old_offset)
+#define ALT_ALT_PTR(a) __ALT_PTR(a, alt_offset)
+
void __init apply_boot_alternatives(void);
void __init apply_early_boot_alternatives(void);
void apply_module_alternatives(void *start, size_t length);
+void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len,
+ int patch_offset);
+
struct alt_entry {
- void *old_ptr; /* address of original instruciton or data */
- void *alt_ptr; /* address of replacement instruction or data */
- unsigned long vendor_id; /* cpu vendor id */
- unsigned long alt_len; /* The replacement size */
- unsigned int errata_id; /* The errata id */
-} __packed;
+ s32 old_offset; /* offset relative to original instruction or data */
+ s32 alt_offset; /* offset relative to replacement instruction or data */
+ u16 vendor_id; /* cpu vendor id */
+ u16 alt_len; /* The replacement size */
+ u32 errata_id; /* The errata id */
+};
struct errata_checkfunc_id {
unsigned long vendor_id;
diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h
index e7acffdf21d2..30e7d2455960 100644
--- a/arch/riscv/include/asm/elf.h
+++ b/arch/riscv/include/asm/elf.h
@@ -14,6 +14,7 @@
#include <asm/auxvec.h>
#include <asm/byteorder.h>
#include <asm/cacheinfo.h>
+#include <asm/hwcap.h>
/*
* These are used to set parameters in the core dumps.
@@ -59,12 +60,13 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12))
#endif
#endif
+
/*
- * This yields a mask that user programs can use to figure out what
- * instruction set this CPU supports. This could be done in user space,
- * but it's not easy, and we've already done it here.
+ * Provides information on the availiable set of ISA extensions to userspace,
+ * via a bitmap that coorespends to each single-letter ISA extension. This is
+ * essentially defunct, but will remain for compatibility with userspace.
*/
-#define ELF_HWCAP (elf_hwcap)
+#define ELF_HWCAP (elf_hwcap & ((1UL << RISCV_ISA_EXT_BASE) - 1))
extern unsigned long elf_hwcap;
/*
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index 4180312d2a70..fb1a810f3d8c 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -7,6 +7,8 @@
#include <asm/alternative.h>
#include <asm/csr.h>
+#include <asm/insn-def.h>
+#include <asm/hwcap.h>
#include <asm/vendorid_list.h>
#ifdef CONFIG_ERRATA_SIFIVE
@@ -22,10 +24,6 @@
#define ERRATA_THEAD_NUMBER 3
#endif
-#define CPUFEATURE_SVPBMT 0
-#define CPUFEATURE_ZICBOM 1
-#define CPUFEATURE_NUMBER 2
-
#ifdef __ASSEMBLY__
#define ALT_INSN_FAULT(x) \
@@ -55,7 +53,7 @@ asm(ALTERNATIVE("sfence.vma %0", "sfence.vma", SIFIVE_VENDOR_ID, \
#define ALT_SVPBMT(_val, prot) \
asm(ALTERNATIVE_2("li %0, 0\t\nnop", \
"li %0, %1\t\nslli %0,%0,%3", 0, \
- CPUFEATURE_SVPBMT, CONFIG_RISCV_ISA_SVPBMT, \
+ RISCV_ISA_EXT_SVPBMT, CONFIG_RISCV_ISA_SVPBMT, \
"li %0, %2\t\nslli %0,%0,%4", THEAD_VENDOR_ID, \
ERRATA_THEAD_PBMT, CONFIG_ERRATA_THEAD_PBMT) \
: "=r"(_val) \
@@ -125,11 +123,11 @@ asm volatile(ALTERNATIVE_2( \
"mv a0, %1\n\t" \
"j 2f\n\t" \
"3:\n\t" \
- "cbo." __stringify(_op) " (a0)\n\t" \
+ CBO_##_op(a0) \
"add a0, a0, %0\n\t" \
"2:\n\t" \
"bltu a0, %2, 3b\n\t" \
- "nop", 0, CPUFEATURE_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \
+ "nop", 0, RISCV_ISA_EXT_ZICBOM, CONFIG_RISCV_ISA_ZICBOM, \
"mv a0, %1\n\t" \
"j 2f\n\t" \
"3:\n\t" \
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 04dad3380041..9e73922e1e2e 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -42,6 +42,14 @@ struct dyn_arch_ftrace {
* 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to
* return address (original pc + 4)
*
+ *<ftrace enable>:
+ * 0: auipc t0/ra, 0x?
+ * 4: jalr t0/ra, ?(t0/ra)
+ *
+ *<ftrace disable>:
+ * 0: nop
+ * 4: nop
+ *
* Dynamic ftrace generates probes to call sites, so we must deal with
* both auipc and jalr at the same time.
*/
@@ -52,25 +60,43 @@ struct dyn_arch_ftrace {
#define AUIPC_OFFSET_MASK (0xfffff000)
#define AUIPC_PAD (0x00001000)
#define JALR_SHIFT 20
-#define JALR_BASIC (0x000080e7)
-#define AUIPC_BASIC (0x00000097)
+#define JALR_RA (0x000080e7)
+#define AUIPC_RA (0x00000097)
+#define JALR_T0 (0x000282e7)
+#define AUIPC_T0 (0x00000297)
#define NOP4 (0x00000013)
-#define make_call(caller, callee, call) \
+#define to_jalr_t0(offset) \
+ (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_T0)
+
+#define to_auipc_t0(offset) \
+ ((offset & JALR_SIGN_MASK) ? \
+ (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_T0) : \
+ ((offset & AUIPC_OFFSET_MASK) | AUIPC_T0))
+
+#define make_call_t0(caller, callee, call) \
do { \
- call[0] = to_auipc_insn((unsigned int)((unsigned long)callee - \
- (unsigned long)caller)); \
- call[1] = to_jalr_insn((unsigned int)((unsigned long)callee - \
- (unsigned long)caller)); \
+ unsigned int offset = \
+ (unsigned long) callee - (unsigned long) caller; \
+ call[0] = to_auipc_t0(offset); \
+ call[1] = to_jalr_t0(offset); \
} while (0)
-#define to_jalr_insn(offset) \
- (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_BASIC)
+#define to_jalr_ra(offset) \
+ (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_RA)
-#define to_auipc_insn(offset) \
+#define to_auipc_ra(offset) \
((offset & JALR_SIGN_MASK) ? \
- (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_BASIC) : \
- ((offset & AUIPC_OFFSET_MASK) | AUIPC_BASIC))
+ (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_RA) : \
+ ((offset & AUIPC_OFFSET_MASK) | AUIPC_RA))
+
+#define make_call_ra(caller, callee, call) \
+do { \
+ unsigned int offset = \
+ (unsigned long) callee - (unsigned long) caller; \
+ call[0] = to_auipc_ra(offset); \
+ call[1] = to_jalr_ra(offset); \
+} while (0)
/*
* Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here.
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 64ad1937e714..e3021b2590de 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -8,24 +8,11 @@
#ifndef _ASM_RISCV_HWCAP_H
#define _ASM_RISCV_HWCAP_H
+#include <asm/alternative-macros.h>
#include <asm/errno.h>
#include <linux/bits.h>
#include <uapi/asm/hwcap.h>
-#ifndef __ASSEMBLY__
-#include <linux/jump_label.h>
-/*
- * This yields a mask that user programs can use to figure out what
- * instruction set this cpu supports.
- */
-#define ELF_HWCAP (elf_hwcap)
-
-enum {
- CAP_HWCAP = 1,
-};
-
-extern unsigned long elf_hwcap;
-
#define RISCV_ISA_EXT_a ('a' - 'a')
#define RISCV_ISA_EXT_c ('c' - 'a')
#define RISCV_ISA_EXT_d ('d' - 'a')
@@ -37,42 +24,31 @@ extern unsigned long elf_hwcap;
#define RISCV_ISA_EXT_u ('u' - 'a')
/*
- * Increse this to higher value as kernel support more ISA extensions.
+ * These macros represent the logical IDs of each multi-letter RISC-V ISA
+ * extension and are used in the ISA bitmap. The logical IDs start from
+ * RISCV_ISA_EXT_BASE, which allows the 0-25 range to be reserved for single
+ * letter extensions. The maximum, RISCV_ISA_EXT_MAX, is defined in order
+ * to allocate the bitmap and may be increased when necessary.
+ *
+ * New extensions should just be added to the bottom, rather than added
+ * alphabetically, in order to avoid unnecessary shuffling.
*/
-#define RISCV_ISA_EXT_MAX 64
-#define RISCV_ISA_EXT_NAME_LEN_MAX 32
+#define RISCV_ISA_EXT_BASE 26
-/* The base ID for multi-letter ISA extensions */
-#define RISCV_ISA_EXT_BASE 26
+#define RISCV_ISA_EXT_SSCOFPMF 26
+#define RISCV_ISA_EXT_SSTC 27
+#define RISCV_ISA_EXT_SVINVAL 28
+#define RISCV_ISA_EXT_SVPBMT 29
+#define RISCV_ISA_EXT_ZBB 30
+#define RISCV_ISA_EXT_ZICBOM 31
+#define RISCV_ISA_EXT_ZIHINTPAUSE 32
-/*
- * This enum represent the logical ID for each multi-letter RISC-V ISA extension.
- * The logical ID should start from RISCV_ISA_EXT_BASE and must not exceed
- * RISCV_ISA_EXT_MAX. 0-25 range is reserved for single letter
- * extensions while all the multi-letter extensions should define the next
- * available logical extension id.
- */
-enum riscv_isa_ext_id {
- RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
- RISCV_ISA_EXT_SVPBMT,
- RISCV_ISA_EXT_ZICBOM,
- RISCV_ISA_EXT_ZIHINTPAUSE,
- RISCV_ISA_EXT_SSTC,
- RISCV_ISA_EXT_SVINVAL,
- RISCV_ISA_EXT_ID_MAX
-};
-static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX);
+#define RISCV_ISA_EXT_MAX 64
+#define RISCV_ISA_EXT_NAME_LEN_MAX 32
-/*
- * This enum represents the logical ID for each RISC-V ISA extension static
- * keys. We can use static key to optimize code path if some ISA extensions
- * are available.
- */
-enum riscv_isa_ext_key {
- RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */
- RISCV_ISA_EXT_KEY_SVINVAL,
- RISCV_ISA_EXT_KEY_MAX,
-};
+#ifndef __ASSEMBLY__
+
+#include <linux/jump_label.h>
struct riscv_isa_ext_data {
/* Name of the extension displayed to userspace via /proc/cpuinfo */
@@ -81,20 +57,40 @@ struct riscv_isa_ext_data {
unsigned int isa_ext_id;
};
-extern struct static_key_false riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_MAX];
+static __always_inline bool
+riscv_has_extension_likely(const unsigned long ext)
+{
+ compiletime_assert(ext < RISCV_ISA_EXT_MAX,
+ "ext must be < RISCV_ISA_EXT_MAX");
+
+ asm_volatile_goto(
+ ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1)
+ :
+ : [ext] "i" (ext)
+ :
+ : l_no);
+
+ return true;
+l_no:
+ return false;
+}
-static __always_inline int riscv_isa_ext2key(int num)
+static __always_inline bool
+riscv_has_extension_unlikely(const unsigned long ext)
{
- switch (num) {
- case RISCV_ISA_EXT_f:
- return RISCV_ISA_EXT_KEY_FPU;
- case RISCV_ISA_EXT_d:
- return RISCV_ISA_EXT_KEY_FPU;
- case RISCV_ISA_EXT_SVINVAL:
- return RISCV_ISA_EXT_KEY_SVINVAL;
- default:
- return -EINVAL;
- }
+ compiletime_assert(ext < RISCV_ISA_EXT_MAX,
+ "ext must be < RISCV_ISA_EXT_MAX");
+
+ asm_volatile_goto(
+ ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1)
+ :
+ : [ext] "i" (ext)
+ :
+ : l_yes);
+
+ return false;
+l_yes:
+ return true;
}
unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap);
diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index 16044affa57c..e01ab51f50d2 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -12,6 +12,12 @@
#define INSN_R_RD_SHIFT 7
#define INSN_R_OPCODE_SHIFT 0
+#define INSN_I_SIMM12_SHIFT 20
+#define INSN_I_RS1_SHIFT 15
+#define INSN_I_FUNC3_SHIFT 12
+#define INSN_I_RD_SHIFT 7
+#define INSN_I_OPCODE_SHIFT 0
+
#ifdef __ASSEMBLY__
#ifdef CONFIG_AS_HAS_INSN
@@ -20,6 +26,10 @@
.insn r \opcode, \func3, \func7, \rd, \rs1, \rs2
.endm
+ .macro insn_i, opcode, func3, rd, rs1, simm12
+ .insn i \opcode, \func3, \rd, \rs1, \simm12
+ .endm
+
#else
#include <asm/gpr-num.h>
@@ -33,9 +43,18 @@
(.L__gpr_num_\rs2 << INSN_R_RS2_SHIFT))
.endm
+ .macro insn_i, opcode, func3, rd, rs1, simm12
+ .4byte ((\opcode << INSN_I_OPCODE_SHIFT) | \
+ (\func3 << INSN_I_FUNC3_SHIFT) | \
+ (.L__gpr_num_\rd << INSN_I_RD_SHIFT) | \
+ (.L__gpr_num_\rs1 << INSN_I_RS1_SHIFT) | \
+ (\simm12 << INSN_I_SIMM12_SHIFT))
+ .endm
+
#endif
#define __INSN_R(...) insn_r __VA_ARGS__
+#define __INSN_I(...) insn_i __VA_ARGS__
#else /* ! __ASSEMBLY__ */
@@ -44,6 +63,9 @@
#define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
".insn r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n"
+#define __INSN_I(opcode, func3, rd, rs1, simm12) \
+ ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
+
#else
#include <linux/stringify.h>
@@ -60,14 +82,32 @@
" (.L__gpr_num_\\rs2 << " __stringify(INSN_R_RS2_SHIFT) "))\n" \
" .endm\n"
+#define DEFINE_INSN_I \
+ __DEFINE_ASM_GPR_NUMS \
+" .macro insn_i, opcode, func3, rd, rs1, simm12\n" \
+" .4byte ((\\opcode << " __stringify(INSN_I_OPCODE_SHIFT) ") |" \
+" (\\func3 << " __stringify(INSN_I_FUNC3_SHIFT) ") |" \
+" (.L__gpr_num_\\rd << " __stringify(INSN_I_RD_SHIFT) ") |" \
+" (.L__gpr_num_\\rs1 << " __stringify(INSN_I_RS1_SHIFT) ") |" \
+" (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \
+" .endm\n"
+
#define UNDEFINE_INSN_R \
" .purgem insn_r\n"
+#define UNDEFINE_INSN_I \
+" .purgem insn_i\n"
+
#define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \
DEFINE_INSN_R \
"insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
UNDEFINE_INSN_R
+#define __INSN_I(opcode, func3, rd, rs1, simm12) \
+ DEFINE_INSN_I \
+ "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
+ UNDEFINE_INSN_I
+
#endif
#endif /* ! __ASSEMBLY__ */
@@ -76,9 +116,14 @@
__INSN_R(RV_##opcode, RV_##func3, RV_##func7, \
RV_##rd, RV_##rs1, RV_##rs2)
+#define INSN_I(opcode, func3, rd, rs1, simm12) \
+ __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \
+ RV_##rs1, RV_##simm12)
+
#define RV_OPCODE(v) __ASM_STR(v)
#define RV_FUNC3(v) __ASM_STR(v)
#define RV_FUNC7(v) __ASM_STR(v)
+#define RV_SIMM12(v) __ASM_STR(v)
#define RV_RD(v) __ASM_STR(v)
#define RV_RS1(v) __ASM_STR(v)
#define RV_RS2(v) __ASM_STR(v)
@@ -87,6 +132,7 @@
#define RV___RS1(v) __RV_REG(v)
#define RV___RS2(v) __RV_REG(v)
+#define RV_OPCODE_MISC_MEM RV_OPCODE(15)
#define RV_OPCODE_SYSTEM RV_OPCODE(115)
#define HFENCE_VVMA(vaddr, asid) \
@@ -134,4 +180,16 @@
INSN_R(OPCODE_SYSTEM, FUNC3(0), FUNC7(51), \
__RD(0), RS1(gaddr), RS2(vmid))
+#define CBO_inval(base) \
+ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
+ RS1(base), SIMM12(0))
+
+#define CBO_clean(base) \
+ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
+ RS1(base), SIMM12(1))
+
+#define CBO_flush(base) \
+ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
+ RS1(base), SIMM12(2))
+
#endif /* __ASM_INSN_DEF_H */
diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h
new file mode 100644
index 000000000000..8d5c84f2d5ef
--- /dev/null
+++ b/arch/riscv/include/asm/insn.h
@@ -0,0 +1,381 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 SiFive
+ */
+
+#ifndef _ASM_RISCV_INSN_H
+#define _ASM_RISCV_INSN_H
+
+#include <linux/bits.h>
+
+#define RV_INSN_FUNCT3_MASK GENMASK(14, 12)
+#define RV_INSN_FUNCT3_OPOFF 12
+#define RV_INSN_OPCODE_MASK GENMASK(6, 0)
+#define RV_INSN_OPCODE_OPOFF 0
+#define RV_INSN_FUNCT12_OPOFF 20
+
+#define RV_ENCODE_FUNCT3(f_) (RVG_FUNCT3_##f_ << RV_INSN_FUNCT3_OPOFF)
+#define RV_ENCODE_FUNCT12(f_) (RVG_FUNCT12_##f_ << RV_INSN_FUNCT12_OPOFF)
+
+/* The bit field of immediate value in I-type instruction */
+#define RV_I_IMM_SIGN_OPOFF 31
+#define RV_I_IMM_11_0_OPOFF 20
+#define RV_I_IMM_SIGN_OFF 12
+#define RV_I_IMM_11_0_OFF 0
+#define RV_I_IMM_11_0_MASK GENMASK(11, 0)
+
+/* The bit field of immediate value in J-type instruction */
+#define RV_J_IMM_SIGN_OPOFF 31
+#define RV_J_IMM_10_1_OPOFF 21
+#define RV_J_IMM_11_OPOFF 20
+#define RV_J_IMM_19_12_OPOFF 12
+#define RV_J_IMM_SIGN_OFF 20
+#define RV_J_IMM_10_1_OFF 1
+#define RV_J_IMM_11_OFF 11
+#define RV_J_IMM_19_12_OFF 12
+#define RV_J_IMM_10_1_MASK GENMASK(9, 0)
+#define RV_J_IMM_11_MASK GENMASK(0, 0)
+#define RV_J_IMM_19_12_MASK GENMASK(7, 0)
+
+/*
+ * U-type IMMs contain the upper 20bits [31:20] of an immediate with
+ * the rest filled in by zeros, so no shifting required. Similarly,
+ * bit31 contains the signed state, so no sign extension necessary.
+ */
+#define RV_U_IMM_SIGN_OPOFF 31
+#define RV_U_IMM_31_12_OPOFF 0
+#define RV_U_IMM_31_12_MASK GENMASK(31, 12)
+
+/* The bit field of immediate value in B-type instruction */
+#define RV_B_IMM_SIGN_OPOFF 31
+#define RV_B_IMM_10_5_OPOFF 25
+#define RV_B_IMM_4_1_OPOFF 8
+#define RV_B_IMM_11_OPOFF 7
+#define RV_B_IMM_SIGN_OFF 12
+#define RV_B_IMM_10_5_OFF 5
+#define RV_B_IMM_4_1_OFF 1
+#define RV_B_IMM_11_OFF 11
+#define RV_B_IMM_10_5_MASK GENMASK(5, 0)
+#define RV_B_IMM_4_1_MASK GENMASK(3, 0)
+#define RV_B_IMM_11_MASK GENMASK(0, 0)
+
+/* The register offset in RVG instruction */
+#define RVG_RS1_OPOFF 15
+#define RVG_RS2_OPOFF 20
+#define RVG_RD_OPOFF 7
+#define RVG_RD_MASK GENMASK(4, 0)
+
+/* The bit field of immediate value in RVC J instruction */
+#define RVC_J_IMM_SIGN_OPOFF 12
+#define RVC_J_IMM_4_OPOFF 11
+#define RVC_J_IMM_9_8_OPOFF 9
+#define RVC_J_IMM_10_OPOFF 8
+#define RVC_J_IMM_6_OPOFF 7
+#define RVC_J_IMM_7_OPOFF 6
+#define RVC_J_IMM_3_1_OPOFF 3
+#define RVC_J_IMM_5_OPOFF 2
+#define RVC_J_IMM_SIGN_OFF 11
+#define RVC_J_IMM_4_OFF 4
+#define RVC_J_IMM_9_8_OFF 8
+#define RVC_J_IMM_10_OFF 10
+#define RVC_J_IMM_6_OFF 6
+#define RVC_J_IMM_7_OFF 7
+#define RVC_J_IMM_3_1_OFF 1
+#define RVC_J_IMM_5_OFF 5
+#define RVC_J_IMM_4_MASK GENMASK(0, 0)
+#define RVC_J_IMM_9_8_MASK GENMASK(1, 0)
+#define RVC_J_IMM_10_MASK GENMASK(0, 0)
+#define RVC_J_IMM_6_MASK GENMASK(0, 0)
+#define RVC_J_IMM_7_MASK GENMASK(0, 0)
+#define RVC_J_IMM_3_1_MASK GENMASK(2, 0)
+#define RVC_J_IMM_5_MASK GENMASK(0, 0)
+
+/* The bit field of immediate value in RVC B instruction */
+#define RVC_B_IMM_SIGN_OPOFF 12
+#define RVC_B_IMM_4_3_OPOFF 10
+#define RVC_B_IMM_7_6_OPOFF 5
+#define RVC_B_IMM_2_1_OPOFF 3
+#define RVC_B_IMM_5_OPOFF 2
+#define RVC_B_IMM_SIGN_OFF 8
+#define RVC_B_IMM_4_3_OFF 3
+#define RVC_B_IMM_7_6_OFF 6
+#define RVC_B_IMM_2_1_OFF 1
+#define RVC_B_IMM_5_OFF 5
+#define RVC_B_IMM_4_3_MASK GENMASK(1, 0)
+#define RVC_B_IMM_7_6_MASK GENMASK(1, 0)
+#define RVC_B_IMM_2_1_MASK GENMASK(1, 0)
+#define RVC_B_IMM_5_MASK GENMASK(0, 0)
+
+#define RVC_INSN_FUNCT4_MASK GENMASK(15, 12)
+#define RVC_INSN_FUNCT4_OPOFF 12
+#define RVC_INSN_FUNCT3_MASK GENMASK(15, 13)
+#define RVC_INSN_FUNCT3_OPOFF 13
+#define RVC_INSN_J_RS2_MASK GENMASK(6, 2)
+#define RVC_INSN_OPCODE_MASK GENMASK(1, 0)
+#define RVC_ENCODE_FUNCT3(f_) (RVC_FUNCT3_##f_ << RVC_INSN_FUNCT3_OPOFF)
+#define RVC_ENCODE_FUNCT4(f_) (RVC_FUNCT4_##f_ << RVC_INSN_FUNCT4_OPOFF)
+
+/* The register offset in RVC op=C0 instruction */
+#define RVC_C0_RS1_OPOFF 7
+#define RVC_C0_RS2_OPOFF 2
+#define RVC_C0_RD_OPOFF 2
+
+/* The register offset in RVC op=C1 instruction */
+#define RVC_C1_RS1_OPOFF 7
+#define RVC_C1_RS2_OPOFF 2
+#define RVC_C1_RD_OPOFF 7
+
+/* The register offset in RVC op=C2 instruction */
+#define RVC_C2_RS1_OPOFF 7
+#define RVC_C2_RS2_OPOFF 2
+#define RVC_C2_RD_OPOFF 7
+
+/* parts of opcode for RVG*/
+#define RVG_OPCODE_FENCE 0x0f
+#define RVG_OPCODE_AUIPC 0x17
+#define RVG_OPCODE_BRANCH 0x63
+#define RVG_OPCODE_JALR 0x67
+#define RVG_OPCODE_JAL 0x6f
+#define RVG_OPCODE_SYSTEM 0x73
+
+/* parts of opcode for RVC*/
+#define RVC_OPCODE_C0 0x0
+#define RVC_OPCODE_C1 0x1
+#define RVC_OPCODE_C2 0x2
+
+/* parts of funct3 code for I, M, A extension*/
+#define RVG_FUNCT3_JALR 0x0
+#define RVG_FUNCT3_BEQ 0x0
+#define RVG_FUNCT3_BNE 0x1
+#define RVG_FUNCT3_BLT 0x4
+#define RVG_FUNCT3_BGE 0x5
+#define RVG_FUNCT3_BLTU 0x6
+#define RVG_FUNCT3_BGEU 0x7
+
+/* parts of funct3 code for C extension*/
+#define RVC_FUNCT3_C_BEQZ 0x6
+#define RVC_FUNCT3_C_BNEZ 0x7
+#define RVC_FUNCT3_C_J 0x5
+#define RVC_FUNCT3_C_JAL 0x1
+#define RVC_FUNCT4_C_JR 0x8
+#define RVC_FUNCT4_C_JALR 0x9
+#define RVC_FUNCT4_C_EBREAK 0x9
+
+#define RVG_FUNCT12_EBREAK 0x1
+#define RVG_FUNCT12_SRET 0x102
+
+#define RVG_MATCH_AUIPC (RVG_OPCODE_AUIPC)
+#define RVG_MATCH_JALR (RV_ENCODE_FUNCT3(JALR) | RVG_OPCODE_JALR)
+#define RVG_MATCH_JAL (RVG_OPCODE_JAL)
+#define RVG_MATCH_FENCE (RVG_OPCODE_FENCE)
+#define RVG_MATCH_BEQ (RV_ENCODE_FUNCT3(BEQ) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_BNE (RV_ENCODE_FUNCT3(BNE) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_BLT (RV_ENCODE_FUNCT3(BLT) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_BGE (RV_ENCODE_FUNCT3(BGE) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_BLTU (RV_ENCODE_FUNCT3(BLTU) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_BGEU (RV_ENCODE_FUNCT3(BGEU) | RVG_OPCODE_BRANCH)
+#define RVG_MATCH_EBREAK (RV_ENCODE_FUNCT12(EBREAK) | RVG_OPCODE_SYSTEM)
+#define RVG_MATCH_SRET (RV_ENCODE_FUNCT12(SRET) | RVG_OPCODE_SYSTEM)
+#define RVC_MATCH_C_BEQZ (RVC_ENCODE_FUNCT3(C_BEQZ) | RVC_OPCODE_C1)
+#define RVC_MATCH_C_BNEZ (RVC_ENCODE_FUNCT3(C_BNEZ) | RVC_OPCODE_C1)
+#define RVC_MATCH_C_J (RVC_ENCODE_FUNCT3(C_J) | RVC_OPCODE_C1)
+#define RVC_MATCH_C_JAL (RVC_ENCODE_FUNCT3(C_JAL) | RVC_OPCODE_C1)
+#define RVC_MATCH_C_JR (RVC_ENCODE_FUNCT4(C_JR) | RVC_OPCODE_C2)
+#define RVC_MATCH_C_JALR (RVC_ENCODE_FUNCT4(C_JALR) | RVC_OPCODE_C2)
+#define RVC_MATCH_C_EBREAK (RVC_ENCODE_FUNCT4(C_EBREAK) | RVC_OPCODE_C2)
+
+#define RVG_MASK_AUIPC (RV_INSN_OPCODE_MASK)
+#define RVG_MASK_JALR (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_JAL (RV_INSN_OPCODE_MASK)
+#define RVG_MASK_FENCE (RV_INSN_OPCODE_MASK)
+#define RVC_MASK_C_JALR (RVC_INSN_FUNCT4_MASK | RVC_INSN_J_RS2_MASK | RVC_INSN_OPCODE_MASK)
+#define RVC_MASK_C_JR (RVC_INSN_FUNCT4_MASK | RVC_INSN_J_RS2_MASK | RVC_INSN_OPCODE_MASK)
+#define RVC_MASK_C_JAL (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK)
+#define RVC_MASK_C_J (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK)
+#define RVG_MASK_BEQ (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_BNE (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_BLT (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_BGE (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_BLTU (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVG_MASK_BGEU (RV_INSN_FUNCT3_MASK | RV_INSN_OPCODE_MASK)
+#define RVC_MASK_C_BEQZ (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK)
+#define RVC_MASK_C_BNEZ (RVC_INSN_FUNCT3_MASK | RVC_INSN_OPCODE_MASK)
+#define RVC_MASK_C_EBREAK 0xffff
+#define RVG_MASK_EBREAK 0xffffffff
+#define RVG_MASK_SRET 0xffffffff
+
+#define __INSN_LENGTH_MASK _UL(0x3)
+#define __INSN_LENGTH_GE_32 _UL(0x3)
+#define __INSN_OPCODE_MASK _UL(0x7F)
+#define __INSN_BRANCH_OPCODE _UL(RVG_OPCODE_BRANCH)
+
+#define __RISCV_INSN_FUNCS(name, mask, val) \
+static __always_inline bool riscv_insn_is_##name(u32 code) \
+{ \
+ BUILD_BUG_ON(~(mask) & (val)); \
+ return (code & (mask)) == (val); \
+} \
+
+#if __riscv_xlen == 32
+/* C.JAL is an RV32C-only instruction */
+__RISCV_INSN_FUNCS(c_jal, RVC_MASK_C_JAL, RVC_MATCH_C_JAL)
+#else
+#define riscv_insn_is_c_jal(opcode) 0
+#endif
+__RISCV_INSN_FUNCS(auipc, RVG_MASK_AUIPC, RVG_MATCH_AUIPC)
+__RISCV_INSN_FUNCS(jalr, RVG_MASK_JALR, RVG_MATCH_JALR)
+__RISCV_INSN_FUNCS(jal, RVG_MASK_JAL, RVG_MATCH_JAL)
+__RISCV_INSN_FUNCS(c_jr, RVC_MASK_C_JR, RVC_MATCH_C_JR)
+__RISCV_INSN_FUNCS(c_jalr, RVC_MASK_C_JALR, RVC_MATCH_C_JALR)
+__RISCV_INSN_FUNCS(c_j, RVC_MASK_C_J, RVC_MATCH_C_J)
+__RISCV_INSN_FUNCS(beq, RVG_MASK_BEQ, RVG_MATCH_BEQ)
+__RISCV_INSN_FUNCS(bne, RVG_MASK_BNE, RVG_MATCH_BNE)
+__RISCV_INSN_FUNCS(blt, RVG_MASK_BLT, RVG_MATCH_BLT)
+__RISCV_INSN_FUNCS(bge, RVG_MASK_BGE, RVG_MATCH_BGE)
+__RISCV_INSN_FUNCS(bltu, RVG_MASK_BLTU, RVG_MATCH_BLTU)
+__RISCV_INSN_FUNCS(bgeu, RVG_MASK_BGEU, RVG_MATCH_BGEU)
+__RISCV_INSN_FUNCS(c_beqz, RVC_MASK_C_BEQZ, RVC_MATCH_C_BEQZ)
+__RISCV_INSN_FUNCS(c_bnez, RVC_MASK_C_BNEZ, RVC_MATCH_C_BNEZ)
+__RISCV_INSN_FUNCS(c_ebreak, RVC_MASK_C_EBREAK, RVC_MATCH_C_EBREAK)
+__RISCV_INSN_FUNCS(ebreak, RVG_MASK_EBREAK, RVG_MATCH_EBREAK)
+__RISCV_INSN_FUNCS(sret, RVG_MASK_SRET, RVG_MATCH_SRET)
+__RISCV_INSN_FUNCS(fence, RVG_MASK_FENCE, RVG_MATCH_FENCE);
+
+/* special case to catch _any_ system instruction */
+static __always_inline bool riscv_insn_is_system(u32 code)
+{
+ return (code & RV_INSN_OPCODE_MASK) == RVG_OPCODE_SYSTEM;
+}
+
+/* special case to catch _any_ branch instruction */
+static __always_inline bool riscv_insn_is_branch(u32 code)
+{
+ return (code & RV_INSN_OPCODE_MASK) == RVG_OPCODE_BRANCH;
+}
+
+#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1))
+#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1))
+#define RV_X(X, s, mask) (((X) >> (s)) & (mask))
+#define RVC_X(X, s, mask) RV_X(X, s, mask)
+
+#define RV_EXTRACT_RD_REG(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); })
+
+#define RV_EXTRACT_UTYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X(x_, RV_U_IMM_31_12_OPOFF, RV_U_IMM_31_12_MASK)); })
+
+#define RV_EXTRACT_JTYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X(x_, RV_J_IMM_10_1_OPOFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OFF) | \
+ (RV_X(x_, RV_J_IMM_11_OPOFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OFF) | \
+ (RV_X(x_, RV_J_IMM_19_12_OPOFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OFF) | \
+ (RV_IMM_SIGN(x_) << RV_J_IMM_SIGN_OFF); })
+
+#define RV_EXTRACT_ITYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X(x_, RV_I_IMM_11_0_OPOFF, RV_I_IMM_11_0_MASK)) | \
+ (RV_IMM_SIGN(x_) << RV_I_IMM_SIGN_OFF); })
+
+#define RV_EXTRACT_BTYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RV_X(x_, RV_B_IMM_4_1_OPOFF, RV_B_IMM_4_1_MASK) << RV_B_IMM_4_1_OFF) | \
+ (RV_X(x_, RV_B_IMM_10_5_OPOFF, RV_B_IMM_10_5_MASK) << RV_B_IMM_10_5_OFF) | \
+ (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \
+ (RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); })
+
+#define RVC_EXTRACT_JTYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RVC_X(x_, RVC_J_IMM_3_1_OPOFF, RVC_J_IMM_3_1_MASK) << RVC_J_IMM_3_1_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_4_OPOFF, RVC_J_IMM_4_MASK) << RVC_J_IMM_4_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_5_OPOFF, RVC_J_IMM_5_MASK) << RVC_J_IMM_5_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_6_OPOFF, RVC_J_IMM_6_MASK) << RVC_J_IMM_6_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_7_OPOFF, RVC_J_IMM_7_MASK) << RVC_J_IMM_7_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_9_8_OPOFF, RVC_J_IMM_9_8_MASK) << RVC_J_IMM_9_8_OFF) | \
+ (RVC_X(x_, RVC_J_IMM_10_OPOFF, RVC_J_IMM_10_MASK) << RVC_J_IMM_10_OFF) | \
+ (RVC_IMM_SIGN(x_) << RVC_J_IMM_SIGN_OFF); })
+
+#define RVC_EXTRACT_BTYPE_IMM(x) \
+ ({typeof(x) x_ = (x); \
+ (RVC_X(x_, RVC_B_IMM_2_1_OPOFF, RVC_B_IMM_2_1_MASK) << RVC_B_IMM_2_1_OFF) | \
+ (RVC_X(x_, RVC_B_IMM_4_3_OPOFF, RVC_B_IMM_4_3_MASK) << RVC_B_IMM_4_3_OFF) | \
+ (RVC_X(x_, RVC_B_IMM_5_OPOFF, RVC_B_IMM_5_MASK) << RVC_B_IMM_5_OFF) | \
+ (RVC_X(x_, RVC_B_IMM_7_6_OPOFF, RVC_B_IMM_7_6_MASK) << RVC_B_IMM_7_6_OFF) | \
+ (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); })
+
+/*
+ * Get the immediate from a J-type instruction.
+ *
+ * @insn: instruction to process
+ * Return: immediate
+ */
+static inline s32 riscv_insn_extract_jtype_imm(u32 insn)
+{
+ return RV_EXTRACT_JTYPE_IMM(insn);
+}
+
+/*
+ * Update a J-type instruction with an immediate value.
+ *
+ * @insn: pointer to the jtype instruction
+ * @imm: the immediate to insert into the instruction
+ */
+static inline void riscv_insn_insert_jtype_imm(u32 *insn, s32 imm)
+{
+ /* drop the old IMMs, all jal IMM bits sit at 31:12 */
+ *insn &= ~GENMASK(31, 12);
+ *insn |= (RV_X(imm, RV_J_IMM_10_1_OFF, RV_J_IMM_10_1_MASK) << RV_J_IMM_10_1_OPOFF) |
+ (RV_X(imm, RV_J_IMM_11_OFF, RV_J_IMM_11_MASK) << RV_J_IMM_11_OPOFF) |
+ (RV_X(imm, RV_J_IMM_19_12_OFF, RV_J_IMM_19_12_MASK) << RV_J_IMM_19_12_OPOFF) |
+ (RV_X(imm, RV_J_IMM_SIGN_OFF, 1) << RV_J_IMM_SIGN_OPOFF);
+}
+
+/*
+ * Put together one immediate from a U-type and I-type instruction pair.
+ *
+ * The U-type contains an upper immediate, meaning bits[31:12] with [11:0]
+ * being zero, while the I-type contains a 12bit immediate.
+ * Combined these can encode larger 32bit values and are used for example
+ * in auipc + jalr pairs to allow larger jumps.
+ *
+ * @utype_insn: instruction containing the upper immediate
+ * @itype_insn: instruction
+ * Return: combined immediate
+ */
+static inline s32 riscv_insn_extract_utype_itype_imm(u32 utype_insn, u32 itype_insn)
+{
+ s32 imm;
+
+ imm = RV_EXTRACT_UTYPE_IMM(utype_insn);
+ imm += RV_EXTRACT_ITYPE_IMM(itype_insn);
+
+ return imm;
+}
+
+/*
+ * Update a set of two instructions (U-type + I-type) with an immediate value.
+ *
+ * Used for example in auipc+jalrs pairs the U-type instructions contains
+ * a 20bit upper immediate representing bits[31:12], while the I-type
+ * instruction contains a 12bit immediate representing bits[11:0].
+ *
+ * This also takes into account that both separate immediates are
+ * considered as signed values, so if the I-type immediate becomes
+ * negative (BIT(11) set) the U-type part gets adjusted.
+ *
+ * @utype_insn: pointer to the utype instruction of the pair
+ * @itype_insn: pointer to the itype instruction of the pair
+ * @imm: the immediate to insert into the two instructions
+ */
+static inline void riscv_insn_insert_utype_itype_imm(u32 *utype_insn, u32 *itype_insn, s32 imm)
+{
+ /* drop possible old IMM values */
+ *utype_insn &= ~(RV_U_IMM_31_12_MASK);
+ *itype_insn &= ~(RV_I_IMM_11_0_MASK << RV_I_IMM_11_0_OPOFF);
+
+ /* add the adapted IMMs */
+ *utype_insn |= (imm & RV_U_IMM_31_12_MASK) + ((imm & BIT(11)) << 1);
+ *itype_insn |= ((imm & RV_I_IMM_11_0_MASK) << RV_I_IMM_11_0_OPOFF);
+}
+#endif /* _ASM_RISCV_INSN_H */
diff --git a/arch/riscv/include/asm/jump_label.h b/arch/riscv/include/asm/jump_label.h
index 6d58bbb5da46..14a5ea8d8ef0 100644
--- a/arch/riscv/include/asm/jump_label.h
+++ b/arch/riscv/include/asm/jump_label.h
@@ -18,6 +18,7 @@ static __always_inline bool arch_static_branch(struct static_key * const key,
const bool branch)
{
asm_volatile_goto(
+ " .align 2 \n\t"
" .option push \n\t"
" .option norelax \n\t"
" .option norvc \n\t"
@@ -39,6 +40,7 @@ static __always_inline bool arch_static_branch_jump(struct static_key * const ke
const bool branch)
{
asm_volatile_goto(
+ " .align 2 \n\t"
" .option push \n\t"
" .option norelax \n\t"
" .option norvc \n\t"
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 93f43a3e7886..cc7da66ee0c0 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -18,6 +18,7 @@
#include <asm/kvm_vcpu_insn.h>
#include <asm/kvm_vcpu_sbi.h>
#include <asm/kvm_vcpu_timer.h>
+#include <asm/kvm_vcpu_pmu.h>
#define KVM_MAX_VCPUS 1024
@@ -228,9 +229,11 @@ struct kvm_vcpu_arch {
/* Don't run the VCPU (blocked) */
bool pause;
+
+ /* Performance monitoring context */
+ struct kvm_pmu pmu_context;
};
-static inline void kvm_arch_hardware_unsetup(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
@@ -297,11 +300,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm);
void kvm_riscv_gstage_free_pgd(struct kvm *kvm);
void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu);
-void kvm_riscv_gstage_mode_detect(void);
-unsigned long kvm_riscv_gstage_mode(void);
+void __init kvm_riscv_gstage_mode_detect(void);
+unsigned long __init kvm_riscv_gstage_mode(void);
int kvm_riscv_gstage_gpa_bits(void);
-void kvm_riscv_gstage_vmid_detect(void);
+void __init kvm_riscv_gstage_vmid_detect(void);
unsigned long kvm_riscv_gstage_vmid_bits(void);
int kvm_riscv_gstage_vmid_init(struct kvm *kvm);
bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid);
diff --git a/arch/riscv/include/asm/kvm_vcpu_pmu.h b/arch/riscv/include/asm/kvm_vcpu_pmu.h
new file mode 100644
index 000000000000..395518a1664e
--- /dev/null
+++ b/arch/riscv/include/asm/kvm_vcpu_pmu.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023 Rivos Inc
+ *
+ * Authors:
+ * Atish Patra <atishp@rivosinc.com>
+ */
+
+#ifndef __KVM_VCPU_RISCV_PMU_H
+#define __KVM_VCPU_RISCV_PMU_H
+
+#include <linux/perf/riscv_pmu.h>
+#include <asm/sbi.h>
+
+#ifdef CONFIG_RISCV_PMU_SBI
+#define RISCV_KVM_MAX_FW_CTRS 32
+#define RISCV_KVM_MAX_HW_CTRS 32
+#define RISCV_KVM_MAX_COUNTERS (RISCV_KVM_MAX_HW_CTRS + RISCV_KVM_MAX_FW_CTRS)
+static_assert(RISCV_KVM_MAX_COUNTERS <= 64);
+
+struct kvm_fw_event {
+ /* Current value of the event */
+ unsigned long value;
+
+ /* Event monitoring status */
+ bool started;
+};
+
+/* Per virtual pmu counter data */
+struct kvm_pmc {
+ u8 idx;
+ struct perf_event *perf_event;
+ u64 counter_val;
+ union sbi_pmu_ctr_info cinfo;
+ /* Event monitoring status */
+ bool started;
+ /* Monitoring event ID */
+ unsigned long event_idx;
+};
+
+/* PMU data structure per vcpu */
+struct kvm_pmu {
+ struct kvm_pmc pmc[RISCV_KVM_MAX_COUNTERS];
+ struct kvm_fw_event fw_event[RISCV_KVM_MAX_FW_CTRS];
+ /* Number of the virtual firmware counters available */
+ int num_fw_ctrs;
+ /* Number of the virtual hardware counters available */
+ int num_hw_ctrs;
+ /* A flag to indicate that pmu initialization is done */
+ bool init_done;
+ /* Bit map of all the virtual counter used */
+ DECLARE_BITMAP(pmc_in_use, RISCV_KVM_MAX_COUNTERS);
+};
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu_context)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu_context))
+
+#if defined(CONFIG_32BIT)
+#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
+{.base = CSR_CYCLEH, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm }, \
+{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm },
+#else
+#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
+{.base = CSR_CYCLE, .count = 31, .func = kvm_riscv_vcpu_pmu_read_hpm },
+#endif
+
+int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid);
+int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask);
+
+int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu, struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags, u64 ival,
+ struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ unsigned long eidx, u64 evtdata,
+ struct kvm_vcpu_sbi_return *retdata);
+int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata);
+void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu);
+void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu);
+
+#else
+struct kvm_pmu {
+};
+
+#define KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS \
+{.base = 0, .count = 0, .func = NULL },
+
+static inline void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu) {}
+static inline int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
+{
+ return 0;
+}
+
+static inline void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu) {}
+static inline void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu) {}
+#endif /* CONFIG_RISCV_PMU_SBI */
+#endif /* !__KVM_VCPU_RISCV_PMU_H */
diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h
index f79478a85d2d..8425556af7d1 100644
--- a/arch/riscv/include/asm/kvm_vcpu_sbi.h
+++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h
@@ -18,6 +18,13 @@ struct kvm_vcpu_sbi_context {
int return_handled;
};
+struct kvm_vcpu_sbi_return {
+ unsigned long out_val;
+ unsigned long err_val;
+ struct kvm_cpu_trap *utrap;
+ bool uexit;
+};
+
struct kvm_vcpu_sbi_extension {
unsigned long extid_start;
unsigned long extid_end;
@@ -27,8 +34,10 @@ struct kvm_vcpu_sbi_extension {
* specific error codes.
*/
int (*handler)(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val, struct kvm_cpu_trap *utrap,
- bool *exit);
+ struct kvm_vcpu_sbi_return *retdata);
+
+ /* Extension specific probe function */
+ unsigned long (*probe)(struct kvm_vcpu *vcpu);
};
void kvm_riscv_vcpu_sbi_forward(struct kvm_vcpu *vcpu, struct kvm_run *run);
diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h
index 76aa96a9fc08..0f3baaa6a9a8 100644
--- a/arch/riscv/include/asm/module.h
+++ b/arch/riscv/include/asm/module.h
@@ -5,6 +5,7 @@
#define _ASM_RISCV_MODULE_H
#include <asm-generic/module.h>
+#include <linux/elf.h>
struct module;
unsigned long module_emit_got_entry(struct module *mod, unsigned long val);
@@ -111,4 +112,19 @@ static inline struct plt_entry *get_plt_entry(unsigned long val,
#endif /* CONFIG_MODULE_SECTIONS */
+static inline const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
+ const Elf_Shdr *sechdrs,
+ const char *name)
+{
+ const Elf_Shdr *s, *se;
+ const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+ for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
+ if (strcmp(name, secstrs + s->sh_name) == 0)
+ return s;
+ }
+
+ return NULL;
+}
+
#endif /* _ASM_RISCV_MODULE_H */
diff --git a/arch/riscv/include/asm/parse_asm.h b/arch/riscv/include/asm/parse_asm.h
deleted file mode 100644
index f36368de839f..000000000000
--- a/arch/riscv/include/asm/parse_asm.h
+++ /dev/null
@@ -1,219 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2020 SiFive
- */
-
-#include <linux/bits.h>
-
-/* The bit field of immediate value in I-type instruction */
-#define I_IMM_SIGN_OPOFF 31
-#define I_IMM_11_0_OPOFF 20
-#define I_IMM_SIGN_OFF 12
-#define I_IMM_11_0_OFF 0
-#define I_IMM_11_0_MASK GENMASK(11, 0)
-
-/* The bit field of immediate value in J-type instruction */
-#define J_IMM_SIGN_OPOFF 31
-#define J_IMM_10_1_OPOFF 21
-#define J_IMM_11_OPOFF 20
-#define J_IMM_19_12_OPOFF 12
-#define J_IMM_SIGN_OFF 20
-#define J_IMM_10_1_OFF 1
-#define J_IMM_11_OFF 11
-#define J_IMM_19_12_OFF 12
-#define J_IMM_10_1_MASK GENMASK(9, 0)
-#define J_IMM_11_MASK GENMASK(0, 0)
-#define J_IMM_19_12_MASK GENMASK(7, 0)
-
-/* The bit field of immediate value in B-type instruction */
-#define B_IMM_SIGN_OPOFF 31
-#define B_IMM_10_5_OPOFF 25
-#define B_IMM_4_1_OPOFF 8
-#define B_IMM_11_OPOFF 7
-#define B_IMM_SIGN_OFF 12
-#define B_IMM_10_5_OFF 5
-#define B_IMM_4_1_OFF 1
-#define B_IMM_11_OFF 11
-#define B_IMM_10_5_MASK GENMASK(5, 0)
-#define B_IMM_4_1_MASK GENMASK(3, 0)
-#define B_IMM_11_MASK GENMASK(0, 0)
-
-/* The register offset in RVG instruction */
-#define RVG_RS1_OPOFF 15
-#define RVG_RS2_OPOFF 20
-#define RVG_RD_OPOFF 7
-
-/* The bit field of immediate value in RVC J instruction */
-#define RVC_J_IMM_SIGN_OPOFF 12
-#define RVC_J_IMM_4_OPOFF 11
-#define RVC_J_IMM_9_8_OPOFF 9
-#define RVC_J_IMM_10_OPOFF 8
-#define RVC_J_IMM_6_OPOFF 7
-#define RVC_J_IMM_7_OPOFF 6
-#define RVC_J_IMM_3_1_OPOFF 3
-#define RVC_J_IMM_5_OPOFF 2
-#define RVC_J_IMM_SIGN_OFF 11
-#define RVC_J_IMM_4_OFF 4
-#define RVC_J_IMM_9_8_OFF 8
-#define RVC_J_IMM_10_OFF 10
-#define RVC_J_IMM_6_OFF 6
-#define RVC_J_IMM_7_OFF 7
-#define RVC_J_IMM_3_1_OFF 1
-#define RVC_J_IMM_5_OFF 5
-#define RVC_J_IMM_4_MASK GENMASK(0, 0)
-#define RVC_J_IMM_9_8_MASK GENMASK(1, 0)
-#define RVC_J_IMM_10_MASK GENMASK(0, 0)
-#define RVC_J_IMM_6_MASK GENMASK(0, 0)
-#define RVC_J_IMM_7_MASK GENMASK(0, 0)
-#define RVC_J_IMM_3_1_MASK GENMASK(2, 0)
-#define RVC_J_IMM_5_MASK GENMASK(0, 0)
-
-/* The bit field of immediate value in RVC B instruction */
-#define RVC_B_IMM_SIGN_OPOFF 12
-#define RVC_B_IMM_4_3_OPOFF 10
-#define RVC_B_IMM_7_6_OPOFF 5
-#define RVC_B_IMM_2_1_OPOFF 3
-#define RVC_B_IMM_5_OPOFF 2
-#define RVC_B_IMM_SIGN_OFF 8
-#define RVC_B_IMM_4_3_OFF 3
-#define RVC_B_IMM_7_6_OFF 6
-#define RVC_B_IMM_2_1_OFF 1
-#define RVC_B_IMM_5_OFF 5
-#define RVC_B_IMM_4_3_MASK GENMASK(1, 0)
-#define RVC_B_IMM_7_6_MASK GENMASK(1, 0)
-#define RVC_B_IMM_2_1_MASK GENMASK(1, 0)
-#define RVC_B_IMM_5_MASK GENMASK(0, 0)
-
-/* The register offset in RVC op=C0 instruction */
-#define RVC_C0_RS1_OPOFF 7
-#define RVC_C0_RS2_OPOFF 2
-#define RVC_C0_RD_OPOFF 2
-
-/* The register offset in RVC op=C1 instruction */
-#define RVC_C1_RS1_OPOFF 7
-#define RVC_C1_RS2_OPOFF 2
-#define RVC_C1_RD_OPOFF 7
-
-/* The register offset in RVC op=C2 instruction */
-#define RVC_C2_RS1_OPOFF 7
-#define RVC_C2_RS2_OPOFF 2
-#define RVC_C2_RD_OPOFF 7
-
-/* parts of opcode for RVG*/
-#define OPCODE_BRANCH 0x63
-#define OPCODE_JALR 0x67
-#define OPCODE_JAL 0x6f
-#define OPCODE_SYSTEM 0x73
-
-/* parts of opcode for RVC*/
-#define OPCODE_C_0 0x0
-#define OPCODE_C_1 0x1
-#define OPCODE_C_2 0x2
-
-/* parts of funct3 code for I, M, A extension*/
-#define FUNCT3_JALR 0x0
-#define FUNCT3_BEQ 0x0
-#define FUNCT3_BNE 0x1000
-#define FUNCT3_BLT 0x4000
-#define FUNCT3_BGE 0x5000
-#define FUNCT3_BLTU 0x6000
-#define FUNCT3_BGEU 0x7000
-
-/* parts of funct3 code for C extension*/
-#define FUNCT3_C_BEQZ 0xc000
-#define FUNCT3_C_BNEZ 0xe000
-#define FUNCT3_C_J 0xa000
-#define FUNCT3_C_JAL 0x2000
-#define FUNCT4_C_JR 0x8000
-#define FUNCT4_C_JALR 0xf000
-
-#define FUNCT12_SRET 0x10200000
-
-#define MATCH_JALR (FUNCT3_JALR | OPCODE_JALR)
-#define MATCH_JAL (OPCODE_JAL)
-#define MATCH_BEQ (FUNCT3_BEQ | OPCODE_BRANCH)
-#define MATCH_BNE (FUNCT3_BNE | OPCODE_BRANCH)
-#define MATCH_BLT (FUNCT3_BLT | OPCODE_BRANCH)
-#define MATCH_BGE (FUNCT3_BGE | OPCODE_BRANCH)
-#define MATCH_BLTU (FUNCT3_BLTU | OPCODE_BRANCH)
-#define MATCH_BGEU (FUNCT3_BGEU | OPCODE_BRANCH)
-#define MATCH_SRET (FUNCT12_SRET | OPCODE_SYSTEM)
-#define MATCH_C_BEQZ (FUNCT3_C_BEQZ | OPCODE_C_1)
-#define MATCH_C_BNEZ (FUNCT3_C_BNEZ | OPCODE_C_1)
-#define MATCH_C_J (FUNCT3_C_J | OPCODE_C_1)
-#define MATCH_C_JAL (FUNCT3_C_JAL | OPCODE_C_1)
-#define MATCH_C_JR (FUNCT4_C_JR | OPCODE_C_2)
-#define MATCH_C_JALR (FUNCT4_C_JALR | OPCODE_C_2)
-
-#define MASK_JALR 0x707f
-#define MASK_JAL 0x7f
-#define MASK_C_JALR 0xf07f
-#define MASK_C_JR 0xf07f
-#define MASK_C_JAL 0xe003
-#define MASK_C_J 0xe003
-#define MASK_BEQ 0x707f
-#define MASK_BNE 0x707f
-#define MASK_BLT 0x707f
-#define MASK_BGE 0x707f
-#define MASK_BLTU 0x707f
-#define MASK_BGEU 0x707f
-#define MASK_C_BEQZ 0xe003
-#define MASK_C_BNEZ 0xe003
-#define MASK_SRET 0xffffffff
-
-#define __INSN_LENGTH_MASK _UL(0x3)
-#define __INSN_LENGTH_GE_32 _UL(0x3)
-#define __INSN_OPCODE_MASK _UL(0x7F)
-#define __INSN_BRANCH_OPCODE _UL(OPCODE_BRANCH)
-
-/* Define a series of is_XXX_insn functions to check if the value INSN
- * is an instance of instruction XXX.
- */
-#define DECLARE_INSN(INSN_NAME, INSN_MATCH, INSN_MASK) \
-static inline bool is_ ## INSN_NAME ## _insn(long insn) \
-{ \
- return (insn & (INSN_MASK)) == (INSN_MATCH); \
-}
-
-#define RV_IMM_SIGN(x) (-(((x) >> 31) & 1))
-#define RVC_IMM_SIGN(x) (-(((x) >> 12) & 1))
-#define RV_X(X, s, mask) (((X) >> (s)) & (mask))
-#define RVC_X(X, s, mask) RV_X(X, s, mask)
-
-#define EXTRACT_JTYPE_IMM(x) \
- ({typeof(x) x_ = (x); \
- (RV_X(x_, J_IMM_10_1_OPOFF, J_IMM_10_1_MASK) << J_IMM_10_1_OFF) | \
- (RV_X(x_, J_IMM_11_OPOFF, J_IMM_11_MASK) << J_IMM_11_OFF) | \
- (RV_X(x_, J_IMM_19_12_OPOFF, J_IMM_19_12_MASK) << J_IMM_19_12_OFF) | \
- (RV_IMM_SIGN(x_) << J_IMM_SIGN_OFF); })
-
-#define EXTRACT_ITYPE_IMM(x) \
- ({typeof(x) x_ = (x); \
- (RV_X(x_, I_IMM_11_0_OPOFF, I_IMM_11_0_MASK)) | \
- (RV_IMM_SIGN(x_) << I_IMM_SIGN_OFF); })
-
-#define EXTRACT_BTYPE_IMM(x) \
- ({typeof(x) x_ = (x); \
- (RV_X(x_, B_IMM_4_1_OPOFF, B_IMM_4_1_MASK) << B_IMM_4_1_OFF) | \
- (RV_X(x_, B_IMM_10_5_OPOFF, B_IMM_10_5_MASK) << B_IMM_10_5_OFF) | \
- (RV_X(x_, B_IMM_11_OPOFF, B_IMM_11_MASK) << B_IMM_11_OFF) | \
- (RV_IMM_SIGN(x_) << B_IMM_SIGN_OFF); })
-
-#define EXTRACT_RVC_J_IMM(x) \
- ({typeof(x) x_ = (x); \
- (RVC_X(x_, RVC_J_IMM_3_1_OPOFF, RVC_J_IMM_3_1_MASK) << RVC_J_IMM_3_1_OFF) | \
- (RVC_X(x_, RVC_J_IMM_4_OPOFF, RVC_J_IMM_4_MASK) << RVC_J_IMM_4_OFF) | \
- (RVC_X(x_, RVC_J_IMM_5_OPOFF, RVC_J_IMM_5_MASK) << RVC_J_IMM_5_OFF) | \
- (RVC_X(x_, RVC_J_IMM_6_OPOFF, RVC_J_IMM_6_MASK) << RVC_J_IMM_6_OFF) | \
- (RVC_X(x_, RVC_J_IMM_7_OPOFF, RVC_J_IMM_7_MASK) << RVC_J_IMM_7_OFF) | \
- (RVC_X(x_, RVC_J_IMM_9_8_OPOFF, RVC_J_IMM_9_8_MASK) << RVC_J_IMM_9_8_OFF) | \
- (RVC_X(x_, RVC_J_IMM_10_OPOFF, RVC_J_IMM_10_MASK) << RVC_J_IMM_10_OFF) | \
- (RVC_IMM_SIGN(x_) << RVC_J_IMM_SIGN_OFF); })
-
-#define EXTRACT_RVC_B_IMM(x) \
- ({typeof(x) x_ = (x); \
- (RVC_X(x_, RVC_B_IMM_2_1_OPOFF, RVC_B_IMM_2_1_MASK) << RVC_B_IMM_2_1_OFF) | \
- (RVC_X(x_, RVC_B_IMM_4_3_OPOFF, RVC_B_IMM_4_3_MASK) << RVC_B_IMM_4_3_OFF) | \
- (RVC_X(x_, RVC_B_IMM_5_OPOFF, RVC_B_IMM_5_MASK) << RVC_B_IMM_5_OFF) | \
- (RVC_X(x_, RVC_B_IMM_7_6_OPOFF, RVC_B_IMM_7_6_MASK) << RVC_B_IMM_7_6_OFF) | \
- (RVC_IMM_SIGN(x_) << RVC_B_IMM_SIGN_OFF); })
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index d8d8de0ded99..ab05f892d317 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -31,7 +31,7 @@
#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t))
/*
- * Half of the kernel address space (half of the entries of the page global
+ * Half of the kernel address space (1/4 of the entries of the page global
* directory) is for the direct mapping.
*/
#define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2)
@@ -415,7 +415,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
* Relying on flush_tlb_fix_spurious_fault would suffice, but
* the extra traps reduce performance. So, eagerly SFENCE.VMA.
*/
- flush_tlb_page(vma, address);
+ local_flush_tlb_page(address);
}
#define __HAVE_ARCH_UPDATE_MMU_TLB
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 4ca7fbacff42..945b7be249c1 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -169,9 +169,9 @@ enum sbi_pmu_fw_generic_events_t {
SBI_PMU_FW_ILLEGAL_INSN = 4,
SBI_PMU_FW_SET_TIMER = 5,
SBI_PMU_FW_IPI_SENT = 6,
- SBI_PMU_FW_IPI_RECVD = 7,
+ SBI_PMU_FW_IPI_RCVD = 7,
SBI_PMU_FW_FENCE_I_SENT = 8,
- SBI_PMU_FW_FENCE_I_RECVD = 9,
+ SBI_PMU_FW_FENCE_I_RCVD = 9,
SBI_PMU_FW_SFENCE_VMA_SENT = 10,
SBI_PMU_FW_SFENCE_VMA_RCVD = 11,
SBI_PMU_FW_SFENCE_VMA_ASID_SENT = 12,
@@ -215,6 +215,9 @@ enum sbi_pmu_ctr_type {
#define SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK 0x06
#define SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK 0x01
+#define SBI_PMU_EVENT_CACHE_ID_SHIFT 3
+#define SBI_PMU_EVENT_CACHE_OP_SHIFT 1
+
#define SBI_PMU_EVENT_IDX_INVALID 0xFFFFFFFF
/* Flags defined for config matching function */
diff --git a/arch/riscv/include/asm/signal.h b/arch/riscv/include/asm/signal.h
index 532c29ef0376..956ae0a01bad 100644
--- a/arch/riscv/include/asm/signal.h
+++ b/arch/riscv/include/asm/signal.h
@@ -7,6 +7,6 @@
#include <uapi/asm/ptrace.h>
asmlinkage __visible
-void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags);
+void do_work_pending(struct pt_regs *regs, unsigned long thread_info_flags);
#endif
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..a96b1fea24fe 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -18,6 +18,16 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
#define __HAVE_ARCH_MEMMOVE
extern asmlinkage void *memmove(void *, const void *, size_t);
extern asmlinkage void *__memmove(void *, const void *, size_t);
+
+#define __HAVE_ARCH_STRCMP
+extern asmlinkage int strcmp(const char *cs, const char *ct);
+
+#define __HAVE_ARCH_STRLEN
+extern asmlinkage __kernel_size_t strlen(const char *);
+
+#define __HAVE_ARCH_STRNCMP
+extern asmlinkage int strncmp(const char *cs, const char *ct, size_t count);
+
/* For those files which don't want to check by kasan. */
#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
#define memcpy(dst, src, len) __memcpy(dst, src, len)
diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h
index 11463489fec6..60f8ca01d36e 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -59,7 +59,8 @@ static inline void __switch_to_aux(struct task_struct *prev,
static __always_inline bool has_fpu(void)
{
- return static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_FPU]);
+ return riscv_has_extension_likely(RISCV_ISA_EXT_f) ||
+ riscv_has_extension_likely(RISCV_ISA_EXT_d);
}
#else
static __always_inline bool has_fpu(void) { return false; }
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 67322f878e0d..f704c8dd57e0 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -43,6 +43,7 @@
#ifndef __ASSEMBLY__
extern long shadow_stack[SHADOW_OVERFLOW_STACK_SIZE / sizeof(long)];
+extern unsigned long spin_shadow_stack;
#include <asm/processor.h>
#include <asm/csr.h>
diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h
index a7644f46d0e5..f891478829a5 100644
--- a/arch/riscv/include/asm/vdso.h
+++ b/arch/riscv/include/asm/vdso.h
@@ -28,8 +28,12 @@
#define COMPAT_VDSO_SYMBOL(base, name) \
(void __user *)((unsigned long)(base) + compat__vdso_##name##_offset)
+extern char compat_vdso_start[], compat_vdso_end[];
+
#endif /* CONFIG_COMPAT */
+extern char vdso_start[], vdso_end[];
+
#endif /* !__ASSEMBLY__ */
#endif /* CONFIG_MMU */
diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c
index a7d26a00beea..2354c69dc7d1 100644
--- a/arch/riscv/kernel/alternative.c
+++ b/arch/riscv/kernel/alternative.c
@@ -11,10 +11,14 @@
#include <linux/cpu.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
+#include <asm/module.h>
#include <asm/sections.h>
+#include <asm/vdso.h>
#include <asm/vendorid_list.h>
#include <asm/sbi.h>
#include <asm/csr.h>
+#include <asm/insn.h>
+#include <asm/patch.h>
struct cpu_manufacturer_info_t {
unsigned long vendor_id;
@@ -53,6 +57,88 @@ static void __init_or_module riscv_fill_cpu_mfr_info(struct cpu_manufacturer_inf
}
}
+static u32 riscv_instruction_at(void *p)
+{
+ u16 *parcel = p;
+
+ return (u32)parcel[0] | (u32)parcel[1] << 16;
+}
+
+static void riscv_alternative_fix_auipc_jalr(void *ptr, u32 auipc_insn,
+ u32 jalr_insn, int patch_offset)
+{
+ u32 call[2] = { auipc_insn, jalr_insn };
+ s32 imm;
+
+ /* get and adjust new target address */
+ imm = riscv_insn_extract_utype_itype_imm(auipc_insn, jalr_insn);
+ imm -= patch_offset;
+
+ /* update instructions */
+ riscv_insn_insert_utype_itype_imm(&call[0], &call[1], imm);
+
+ /* patch the call place again */
+ patch_text_nosync(ptr, call, sizeof(u32) * 2);
+}
+
+static void riscv_alternative_fix_jal(void *ptr, u32 jal_insn, int patch_offset)
+{
+ s32 imm;
+
+ /* get and adjust new target address */
+ imm = riscv_insn_extract_jtype_imm(jal_insn);
+ imm -= patch_offset;
+
+ /* update instruction */
+ riscv_insn_insert_jtype_imm(&jal_insn, imm);
+
+ /* patch the call place again */
+ patch_text_nosync(ptr, &jal_insn, sizeof(u32));
+}
+
+void riscv_alternative_fix_offsets(void *alt_ptr, unsigned int len,
+ int patch_offset)
+{
+ int num_insn = len / sizeof(u32);
+ int i;
+
+ for (i = 0; i < num_insn; i++) {
+ u32 insn = riscv_instruction_at(alt_ptr + i * sizeof(u32));
+
+ /*
+ * May be the start of an auipc + jalr pair
+ * Needs to check that at least one more instruction
+ * is in the list.
+ */
+ if (riscv_insn_is_auipc(insn) && i < num_insn - 1) {
+ u32 insn2 = riscv_instruction_at(alt_ptr + (i + 1) * sizeof(u32));
+
+ if (!riscv_insn_is_jalr(insn2))
+ continue;
+
+ /* if instruction pair is a call, it will use the ra register */
+ if (RV_EXTRACT_RD_REG(insn) != 1)
+ continue;
+
+ riscv_alternative_fix_auipc_jalr(alt_ptr + i * sizeof(u32),
+ insn, insn2, patch_offset);
+ i++;
+ }
+
+ if (riscv_insn_is_jal(insn)) {
+ s32 imm = riscv_insn_extract_jtype_imm(insn);
+
+ /* Don't modify jumps inside the alternative block */
+ if ((alt_ptr + i * sizeof(u32) + imm) >= alt_ptr &&
+ (alt_ptr + i * sizeof(u32) + imm) < (alt_ptr + len))
+ continue;
+
+ riscv_alternative_fix_jal(alt_ptr + i * sizeof(u32),
+ insn, patch_offset);
+ }
+ }
+}
+
/*
* This is called very early in the boot process (directly after we run
* a feature detect on the boot CPU). No need to worry about other CPUs
@@ -77,6 +163,31 @@ static void __init_or_module _apply_alternatives(struct alt_entry *begin,
stage);
}
+#ifdef CONFIG_MMU
+static void __init apply_vdso_alternatives(void)
+{
+ const Elf_Ehdr *hdr;
+ const Elf_Shdr *shdr;
+ const Elf_Shdr *alt;
+ struct alt_entry *begin, *end;
+
+ hdr = (Elf_Ehdr *)vdso_start;
+ shdr = (void *)hdr + hdr->e_shoff;
+ alt = find_section(hdr, shdr, ".alternative");
+ if (!alt)
+ return;
+
+ begin = (void *)hdr + alt->sh_offset,
+ end = (void *)hdr + alt->sh_offset + alt->sh_size,
+
+ _apply_alternatives((struct alt_entry *)begin,
+ (struct alt_entry *)end,
+ RISCV_ALTERNATIVES_BOOT);
+}
+#else
+static void __init apply_vdso_alternatives(void) { }
+#endif
+
void __init apply_boot_alternatives(void)
{
/* If called on non-boot cpu things could go wrong */
@@ -85,6 +196,8 @@ void __init apply_boot_alternatives(void)
_apply_alternatives((struct alt_entry *)__alt_start,
(struct alt_entry *)__alt_end,
RISCV_ALTERNATIVES_BOOT);
+
+ apply_vdso_alternatives();
}
/*
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 1b9a5a66e55a..8400f0cc9704 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -144,30 +144,54 @@ arch_initcall(riscv_cpuinfo_init);
.uprop = #UPROP, \
.isa_ext_id = EXTID, \
}
+
/*
- * Here are the ordering rules of extension naming defined by RISC-V
- * specification :
- * 1. All extensions should be separated from other multi-letter extensions
- * by an underscore.
- * 2. The first letter following the 'Z' conventionally indicates the most
+ * The canonical order of ISA extension names in the ISA string is defined in
+ * chapter 27 of the unprivileged specification.
+ *
+ * Ordinarily, for in-kernel data structures, this order is unimportant but
+ * isa_ext_arr defines the order of the ISA string in /proc/cpuinfo.
+ *
+ * The specification uses vague wording, such as should, when it comes to
+ * ordering, so for our purposes the following rules apply:
+ *
+ * 1. All multi-letter extensions must be separated from other extensions by an
+ * underscore.
+ *
+ * 2. Additional standard extensions (starting with 'Z') must be sorted after
+ * single-letter extensions and before any higher-privileged extensions.
+
+ * 3. The first letter following the 'Z' conventionally indicates the most
* closely related alphabetical extension category, IMAFDQLCBKJTPVH.
- * If multiple 'Z' extensions are named, they should be ordered first
- * by category, then alphabetically within a category.
- * 3. Standard supervisor-level extensions (starts with 'S') should be
- * listed after standard unprivileged extensions. If multiple
- * supervisor-level extensions are listed, they should be ordered
+ * If multiple 'Z' extensions are named, they must be ordered first by
+ * category, then alphabetically within a category.
+ *
+ * 3. Standard supervisor-level extensions (starting with 'S') must be listed
+ * after standard unprivileged extensions. If multiple supervisor-level
+ * extensions are listed, they must be ordered alphabetically.
+ *
+ * 4. Standard machine-level extensions (starting with 'Zxm') must be listed
+ * after any lower-privileged, standard extensions. If multiple
+ * machine-level extensions are listed, they must be ordered
* alphabetically.
- * 4. Non-standard extensions (starts with 'X') must be listed after all
- * standard extensions. They must be separated from other multi-letter
- * extensions by an underscore.
+ *
+ * 5. Non-standard extensions (starting with 'X') must be listed after all
+ * standard extensions. If multiple non-standard extensions are listed, they
+ * must be ordered alphabetically.
+ *
+ * An example string following the order is:
+ * rv64imadc_zifoo_zigoo_zafoo_sbar_scar_zxmbaz_xqux_xrux
+ *
+ * New entries to this struct should follow the ordering rules described above.
*/
static struct riscv_isa_ext_data isa_ext_arr[] = {
+ __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
+ __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
+ __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
__RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
__RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
__RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
- __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
- __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
};
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 93e45560af30..59d58ee0f68d 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -10,6 +10,7 @@
#include <linux/ctype.h>
#include <linux/libfdt.h>
#include <linux/log2.h>
+#include <linux/memory.h>
#include <linux/module.h>
#include <linux/of.h>
#include <asm/alternative.h>
@@ -29,9 +30,6 @@ unsigned long elf_hwcap __read_mostly;
/* Host ISA bitmap */
static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
-DEFINE_STATIC_KEY_ARRAY_FALSE(riscv_isa_ext_keys, RISCV_ISA_EXT_KEY_MAX);
-EXPORT_SYMBOL(riscv_isa_ext_keys);
-
/**
* riscv_isa_extension_base() - Get base extension word
*
@@ -222,12 +220,14 @@ void __init riscv_fill_hwcap(void)
set_bit(nr, this_isa);
}
} else {
+ /* sorted alphabetically */
SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF);
+ SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
+ SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
+ SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
- SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
- SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL);
}
#undef SET_ISA_EXT_MAP
}
@@ -266,81 +266,38 @@ void __init riscv_fill_hwcap(void)
if (elf_hwcap & BIT_MASK(i))
print_str[j++] = (char)('a' + i);
pr_info("riscv: ELF capabilities %s\n", print_str);
-
- for_each_set_bit(i, riscv_isa, RISCV_ISA_EXT_MAX) {
- j = riscv_isa_ext2key(i);
- if (j >= 0)
- static_branch_enable(&riscv_isa_ext_keys[j]);
- }
}
#ifdef CONFIG_RISCV_ALTERNATIVE
-static bool __init_or_module cpufeature_probe_svpbmt(unsigned int stage)
-{
- if (!IS_ENABLED(CONFIG_RISCV_ISA_SVPBMT))
- return false;
-
- if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
- return false;
-
- return riscv_isa_extension_available(NULL, SVPBMT);
-}
-
-static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
-{
- if (!IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM))
- return false;
-
- if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
- return false;
-
- if (!riscv_isa_extension_available(NULL, ZICBOM))
- return false;
-
- riscv_noncoherent_supported();
- return true;
-}
-
-/*
- * Probe presence of individual extensions.
- *
- * This code may also be executed before kernel relocation, so we cannot use
- * addresses generated by the address-of operator as they won't be valid in
- * this context.
- */
-static u32 __init_or_module cpufeature_probe(unsigned int stage)
-{
- u32 cpu_req_feature = 0;
-
- if (cpufeature_probe_svpbmt(stage))
- cpu_req_feature |= BIT(CPUFEATURE_SVPBMT);
-
- if (cpufeature_probe_zicbom(stage))
- cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
-
- return cpu_req_feature;
-}
-
void __init_or_module riscv_cpufeature_patch_func(struct alt_entry *begin,
struct alt_entry *end,
unsigned int stage)
{
- u32 cpu_req_feature = cpufeature_probe(stage);
struct alt_entry *alt;
- u32 tmp;
+ void *oldptr, *altptr;
+
+ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
+ return;
for (alt = begin; alt < end; alt++) {
if (alt->vendor_id != 0)
continue;
- if (alt->errata_id >= CPUFEATURE_NUMBER) {
- WARN(1, "This feature id:%d is not in kernel cpufeature list",
+ if (alt->errata_id >= RISCV_ISA_EXT_MAX) {
+ WARN(1, "This extension id:%d is not in ISA extension list",
alt->errata_id);
continue;
}
- tmp = (1U << alt->errata_id);
- if (cpu_req_feature & tmp)
- patch_text_nosync(alt->old_ptr, alt->alt_ptr, alt->alt_len);
+ if (!__riscv_isa_extension_available(NULL, alt->errata_id))
+ continue;
+
+ oldptr = ALT_OLD_PTR(alt);
+ altptr = ALT_ALT_PTR(alt);
+
+ mutex_lock(&text_mutex);
+ patch_text_nosync(oldptr, altptr, alt->alt_len);
+ riscv_alternative_fix_offsets(oldptr, alt->alt_len, oldptr - altptr);
+ mutex_unlock(&text_mutex);
}
}
#endif
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 2086f6585773..5bff37af4770 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -55,12 +55,15 @@ static int ftrace_check_current_call(unsigned long hook_pos,
}
static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
- bool enable)
+ bool enable, bool ra)
{
unsigned int call[2];
unsigned int nops[2] = {NOP4, NOP4};
- make_call(hook_pos, target, call);
+ if (ra)
+ make_call_ra(hook_pos, target, call);
+ else
+ make_call_t0(hook_pos, target, call);
/* Replace the auipc-jalr pair at once. Return -EPERM on write error. */
if (patch_text_nosync
@@ -70,42 +73,13 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
return 0;
}
-/*
- * Put 5 instructions with 16 bytes at the front of function within
- * patchable function entry nops' area.
- *
- * 0: REG_S ra, -SZREG(sp)
- * 1: auipc ra, 0x?
- * 2: jalr -?(ra)
- * 3: REG_L ra, -SZREG(sp)
- *
- * So the opcodes is:
- * 0: 0xfe113c23 (sd)/0xfe112e23 (sw)
- * 1: 0x???????? -> auipc
- * 2: 0x???????? -> jalr
- * 3: 0xff813083 (ld)/0xffc12083 (lw)
- */
-#if __riscv_xlen == 64
-#define INSN0 0xfe113c23
-#define INSN3 0xff813083
-#elif __riscv_xlen == 32
-#define INSN0 0xfe112e23
-#define INSN3 0xffc12083
-#endif
-
-#define FUNC_ENTRY_SIZE 16
-#define FUNC_ENTRY_JMP 4
-
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned int call[4] = {INSN0, 0, 0, INSN3};
- unsigned long target = addr;
- unsigned long caller = rec->ip + FUNC_ENTRY_JMP;
+ unsigned int call[2];
- call[1] = to_auipc_insn((unsigned int)(target - caller));
- call[2] = to_jalr_insn((unsigned int)(target - caller));
+ make_call_t0(rec->ip, addr, call);
- if (patch_text_nosync((void *)rec->ip, call, FUNC_ENTRY_SIZE))
+ if (patch_text_nosync((void *)rec->ip, call, MCOUNT_INSN_SIZE))
return -EPERM;
return 0;
@@ -114,15 +88,14 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
unsigned long addr)
{
- unsigned int nops[4] = {NOP4, NOP4, NOP4, NOP4};
+ unsigned int nops[2] = {NOP4, NOP4};
- if (patch_text_nosync((void *)rec->ip, nops, FUNC_ENTRY_SIZE))
+ if (patch_text_nosync((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
return -EPERM;
return 0;
}
-
/*
* This is called early on, and isn't wrapped by
* ftrace_arch_code_modify_{prepare,post_process}() and therefor doesn't hold
@@ -144,10 +117,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
int ftrace_update_ftrace_func(ftrace_func_t func)
{
int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
- (unsigned long)func, true);
+ (unsigned long)func, true, true);
if (!ret) {
ret = __ftrace_modify_call((unsigned long)&ftrace_regs_call,
- (unsigned long)func, true);
+ (unsigned long)func, true, true);
}
return ret;
@@ -159,16 +132,16 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
unsigned int call[2];
- unsigned long caller = rec->ip + FUNC_ENTRY_JMP;
+ unsigned long caller = rec->ip;
int ret;
- make_call(caller, old_addr, call);
+ make_call_t0(caller, old_addr, call);
ret = ftrace_check_current_call(caller, call);
if (ret)
return ret;
- return __ftrace_modify_call(caller, addr, true);
+ return __ftrace_modify_call(caller, addr, true, false);
}
#endif
@@ -203,12 +176,12 @@ int ftrace_enable_ftrace_graph_caller(void)
int ret;
ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call,
- (unsigned long)&prepare_ftrace_return, true);
+ (unsigned long)&prepare_ftrace_return, true, true);
if (ret)
return ret;
return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call,
- (unsigned long)&prepare_ftrace_return, true);
+ (unsigned long)&prepare_ftrace_return, true, true);
}
int ftrace_disable_ftrace_graph_caller(void)
@@ -216,12 +189,12 @@ int ftrace_disable_ftrace_graph_caller(void)
int ret;
ret = __ftrace_modify_call((unsigned long)&ftrace_graph_call,
- (unsigned long)&prepare_ftrace_return, false);
+ (unsigned long)&prepare_ftrace_return, false, true);
if (ret)
return ret;
return __ftrace_modify_call((unsigned long)&ftrace_graph_regs_call,
- (unsigned long)&prepare_ftrace_return, false);
+ (unsigned long)&prepare_ftrace_return, false, true);
}
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/riscv/kernel/kgdb.c b/arch/riscv/kernel/kgdb.c
index 963ed7edcff2..2e0266ae6bd7 100644
--- a/arch/riscv/kernel/kgdb.c
+++ b/arch/riscv/kernel/kgdb.c
@@ -11,7 +11,7 @@
#include <linux/string.h>
#include <asm/cacheflush.h>
#include <asm/gdb_xml.h>
-#include <asm/parse_asm.h>
+#include <asm/insn.h>
enum {
NOT_KGDB_BREAK = 0,
@@ -23,27 +23,6 @@ enum {
static unsigned long stepped_address;
static unsigned int stepped_opcode;
-#if __riscv_xlen == 32
-/* C.JAL is an RV32C-only instruction */
-DECLARE_INSN(c_jal, MATCH_C_JAL, MASK_C_JAL)
-#else
-#define is_c_jal_insn(opcode) 0
-#endif
-DECLARE_INSN(jalr, MATCH_JALR, MASK_JALR)
-DECLARE_INSN(jal, MATCH_JAL, MASK_JAL)
-DECLARE_INSN(c_jr, MATCH_C_JR, MASK_C_JR)
-DECLARE_INSN(c_jalr, MATCH_C_JALR, MASK_C_JALR)
-DECLARE_INSN(c_j, MATCH_C_J, MASK_C_J)
-DECLARE_INSN(beq, MATCH_BEQ, MASK_BEQ)
-DECLARE_INSN(bne, MATCH_BNE, MASK_BNE)
-DECLARE_INSN(blt, MATCH_BLT, MASK_BLT)
-DECLARE_INSN(bge, MATCH_BGE, MASK_BGE)
-DECLARE_INSN(bltu, MATCH_BLTU, MASK_BLTU)
-DECLARE_INSN(bgeu, MATCH_BGEU, MASK_BGEU)
-DECLARE_INSN(c_beqz, MATCH_C_BEQZ, MASK_C_BEQZ)
-DECLARE_INSN(c_bnez, MATCH_C_BNEZ, MASK_C_BNEZ)
-DECLARE_INSN(sret, MATCH_SRET, MASK_SRET)
-
static int decode_register_index(unsigned long opcode, int offset)
{
return (opcode >> offset) & 0x1F;
@@ -65,23 +44,25 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr)
if (get_kernel_nofault(op_code, (void *)pc))
return -EINVAL;
if ((op_code & __INSN_LENGTH_MASK) != __INSN_LENGTH_GE_32) {
- if (is_c_jalr_insn(op_code) || is_c_jr_insn(op_code)) {
+ if (riscv_insn_is_c_jalr(op_code) ||
+ riscv_insn_is_c_jr(op_code)) {
rs1_num = decode_register_index(op_code, RVC_C2_RS1_OPOFF);
*next_addr = regs_ptr[rs1_num];
- } else if (is_c_j_insn(op_code) || is_c_jal_insn(op_code)) {
- *next_addr = EXTRACT_RVC_J_IMM(op_code) + pc;
- } else if (is_c_beqz_insn(op_code)) {
+ } else if (riscv_insn_is_c_j(op_code) ||
+ riscv_insn_is_c_jal(op_code)) {
+ *next_addr = RVC_EXTRACT_JTYPE_IMM(op_code) + pc;
+ } else if (riscv_insn_is_c_beqz(op_code)) {
rs1_num = decode_register_index_short(op_code,
RVC_C1_RS1_OPOFF);
if (!rs1_num || regs_ptr[rs1_num] == 0)
- *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc;
+ *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc;
else
*next_addr = pc + 2;
- } else if (is_c_bnez_insn(op_code)) {
+ } else if (riscv_insn_is_c_bnez(op_code)) {
rs1_num =
decode_register_index_short(op_code, RVC_C1_RS1_OPOFF);
if (rs1_num && regs_ptr[rs1_num] != 0)
- *next_addr = EXTRACT_RVC_B_IMM(op_code) + pc;
+ *next_addr = RVC_EXTRACT_BTYPE_IMM(op_code) + pc;
else
*next_addr = pc + 2;
} else {
@@ -90,7 +71,7 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr)
} else {
if ((op_code & __INSN_OPCODE_MASK) == __INSN_BRANCH_OPCODE) {
bool result = false;
- long imm = EXTRACT_BTYPE_IMM(op_code);
+ long imm = RV_EXTRACT_BTYPE_IMM(op_code);
unsigned long rs1_val = 0, rs2_val = 0;
rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF);
@@ -100,34 +81,34 @@ static int get_step_address(struct pt_regs *regs, unsigned long *next_addr)
if (rs2_num)
rs2_val = regs_ptr[rs2_num];
- if (is_beq_insn(op_code))
+ if (riscv_insn_is_beq(op_code))
result = (rs1_val == rs2_val) ? true : false;
- else if (is_bne_insn(op_code))
+ else if (riscv_insn_is_bne(op_code))
result = (rs1_val != rs2_val) ? true : false;
- else if (is_blt_insn(op_code))
+ else if (riscv_insn_is_blt(op_code))
result =
((long)rs1_val <
(long)rs2_val) ? true : false;
- else if (is_bge_insn(op_code))
+ else if (riscv_insn_is_bge(op_code))
result =
((long)rs1_val >=
(long)rs2_val) ? true : false;
- else if (is_bltu_insn(op_code))
+ else if (riscv_insn_is_bltu(op_code))
result = (rs1_val < rs2_val) ? true : false;
- else if (is_bgeu_insn(op_code))
+ else if (riscv_insn_is_bgeu(op_code))
result = (rs1_val >= rs2_val) ? true : false;
if (result)
*next_addr = imm + pc;
else
*next_addr = pc + 4;
- } else if (is_jal_insn(op_code)) {
- *next_addr = EXTRACT_JTYPE_IMM(op_code) + pc;
- } else if (is_jalr_insn(op_code)) {
+ } else if (riscv_insn_is_jal(op_code)) {
+ *next_addr = RV_EXTRACT_JTYPE_IMM(op_code) + pc;
+ } else if (riscv_insn_is_jalr(op_code)) {
rs1_num = decode_register_index(op_code, RVG_RS1_OPOFF);
if (rs1_num)
*next_addr = ((unsigned long *)regs)[rs1_num];
- *next_addr += EXTRACT_ITYPE_IMM(op_code);
- } else if (is_sret_insn(op_code)) {
+ *next_addr += RV_EXTRACT_ITYPE_IMM(op_code);
+ } else if (riscv_insn_is_sret(op_code)) {
*next_addr = pc;
} else {
*next_addr = pc + 4;
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index d171eca623b6..125de818d1ba 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -13,8 +13,8 @@
.text
-#define FENTRY_RA_OFFSET 12
-#define ABI_SIZE_ON_STACK 72
+#define FENTRY_RA_OFFSET 8
+#define ABI_SIZE_ON_STACK 80
#define ABI_A0 0
#define ABI_A1 8
#define ABI_A2 16
@@ -23,10 +23,10 @@
#define ABI_A5 40
#define ABI_A6 48
#define ABI_A7 56
-#define ABI_RA 64
+#define ABI_T0 64
+#define ABI_RA 72
.macro SAVE_ABI
- addi sp, sp, -SZREG
addi sp, sp, -ABI_SIZE_ON_STACK
REG_S a0, ABI_A0(sp)
@@ -37,6 +37,7 @@
REG_S a5, ABI_A5(sp)
REG_S a6, ABI_A6(sp)
REG_S a7, ABI_A7(sp)
+ REG_S t0, ABI_T0(sp)
REG_S ra, ABI_RA(sp)
.endm
@@ -49,24 +50,18 @@
REG_L a5, ABI_A5(sp)
REG_L a6, ABI_A6(sp)
REG_L a7, ABI_A7(sp)
+ REG_L t0, ABI_T0(sp)
REG_L ra, ABI_RA(sp)
addi sp, sp, ABI_SIZE_ON_STACK
- addi sp, sp, SZREG
.endm
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
.macro SAVE_ALL
- addi sp, sp, -SZREG
addi sp, sp, -PT_SIZE_ON_STACK
- REG_S x1, PT_EPC(sp)
- addi sp, sp, PT_SIZE_ON_STACK
- REG_L x1, (sp)
- addi sp, sp, -PT_SIZE_ON_STACK
+ REG_S t0, PT_EPC(sp)
REG_S x1, PT_RA(sp)
- REG_L x1, PT_EPC(sp)
-
REG_S x2, PT_SP(sp)
REG_S x3, PT_GP(sp)
REG_S x4, PT_TP(sp)
@@ -100,15 +95,11 @@
.endm
.macro RESTORE_ALL
+ REG_L t0, PT_EPC(sp)
REG_L x1, PT_RA(sp)
- addi sp, sp, PT_SIZE_ON_STACK
- REG_S x1, (sp)
- addi sp, sp, -PT_SIZE_ON_STACK
- REG_L x1, PT_EPC(sp)
REG_L x2, PT_SP(sp)
REG_L x3, PT_GP(sp)
REG_L x4, PT_TP(sp)
- REG_L x5, PT_T0(sp)
REG_L x6, PT_T1(sp)
REG_L x7, PT_T2(sp)
REG_L x8, PT_S0(sp)
@@ -137,17 +128,16 @@
REG_L x31, PT_T6(sp)
addi sp, sp, PT_SIZE_ON_STACK
- addi sp, sp, SZREG
.endm
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
ENTRY(ftrace_caller)
SAVE_ABI
- addi a0, ra, -FENTRY_RA_OFFSET
+ addi a0, t0, -FENTRY_RA_OFFSET
la a1, function_trace_op
REG_L a2, 0(a1)
- REG_L a1, ABI_SIZE_ON_STACK(sp)
+ mv a1, ra
mv a3, sp
ftrace_call:
@@ -155,8 +145,8 @@ ftrace_call:
call ftrace_stub
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- addi a0, sp, ABI_SIZE_ON_STACK
- REG_L a1, ABI_RA(sp)
+ addi a0, sp, ABI_RA
+ REG_L a1, ABI_T0(sp)
addi a1, a1, -FENTRY_RA_OFFSET
#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
mv a2, s0
@@ -166,17 +156,17 @@ ftrace_graph_call:
call ftrace_stub
#endif
RESTORE_ABI
- ret
+ jr t0
ENDPROC(ftrace_caller)
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
ENTRY(ftrace_regs_caller)
SAVE_ALL
- addi a0, ra, -FENTRY_RA_OFFSET
+ addi a0, t0, -FENTRY_RA_OFFSET
la a1, function_trace_op
REG_L a2, 0(a1)
- REG_L a1, PT_SIZE_ON_STACK(sp)
+ mv a1, ra
mv a3, sp
ftrace_regs_call:
@@ -196,6 +186,6 @@ ftrace_graph_regs_call:
#endif
RESTORE_ALL
- ret
+ jr t0
ENDPROC(ftrace_regs_caller)
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 91fe16bfaa07..7c651d55fcbd 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -268,6 +268,13 @@ static int apply_r_riscv_align_rela(struct module *me, u32 *location,
return -EINVAL;
}
+static int apply_r_riscv_add16_rela(struct module *me, u32 *location,
+ Elf_Addr v)
+{
+ *(u16 *)location += (u16)v;
+ return 0;
+}
+
static int apply_r_riscv_add32_rela(struct module *me, u32 *location,
Elf_Addr v)
{
@@ -282,6 +289,13 @@ static int apply_r_riscv_add64_rela(struct module *me, u32 *location,
return 0;
}
+static int apply_r_riscv_sub16_rela(struct module *me, u32 *location,
+ Elf_Addr v)
+{
+ *(u16 *)location -= (u16)v;
+ return 0;
+}
+
static int apply_r_riscv_sub32_rela(struct module *me, u32 *location,
Elf_Addr v)
{
@@ -315,8 +329,10 @@ static int (*reloc_handlers_rela[]) (struct module *me, u32 *location,
[R_RISCV_CALL] = apply_r_riscv_call_rela,
[R_RISCV_RELAX] = apply_r_riscv_relax_rela,
[R_RISCV_ALIGN] = apply_r_riscv_align_rela,
+ [R_RISCV_ADD16] = apply_r_riscv_add16_rela,
[R_RISCV_ADD32] = apply_r_riscv_add32_rela,
[R_RISCV_ADD64] = apply_r_riscv_add64_rela,
+ [R_RISCV_SUB16] = apply_r_riscv_sub16_rela,
[R_RISCV_SUB32] = apply_r_riscv_sub32_rela,
[R_RISCV_SUB64] = apply_r_riscv_sub64_rela,
};
@@ -429,21 +445,6 @@ void *module_alloc(unsigned long size)
}
#endif
-static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- const char *name)
-{
- const Elf_Shdr *s, *se;
- const char *secstrs = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
-
- for (s = sechdrs, se = sechdrs + hdr->e_shnum; s < se; s++) {
- if (strcmp(name, secstrs + s->sh_name) == 0)
- return s;
- }
-
- return NULL;
-}
-
int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c
index a20568bd1f1a..7441ac8a6843 100644
--- a/arch/riscv/kernel/probes/simulate-insn.c
+++ b/arch/riscv/kernel/probes/simulate-insn.c
@@ -136,13 +136,6 @@ bool __kprobes simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *re
#define branch_offset(opcode) \
sign_extend32((branch_imm(opcode)), 12)
-#define BRANCH_BEQ 0x0
-#define BRANCH_BNE 0x1
-#define BRANCH_BLT 0x4
-#define BRANCH_BGE 0x5
-#define BRANCH_BLTU 0x6
-#define BRANCH_BGEU 0x7
-
bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs)
{
/*
@@ -169,22 +162,22 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r
offset_tmp = branch_offset(opcode);
switch (branch_funct3(opcode)) {
- case BRANCH_BEQ:
+ case RVG_FUNCT3_BEQ:
offset = (rs1_val == rs2_val) ? offset_tmp : 4;
break;
- case BRANCH_BNE:
+ case RVG_FUNCT3_BNE:
offset = (rs1_val != rs2_val) ? offset_tmp : 4;
break;
- case BRANCH_BLT:
+ case RVG_FUNCT3_BLT:
offset = ((long)rs1_val < (long)rs2_val) ? offset_tmp : 4;
break;
- case BRANCH_BGE:
+ case RVG_FUNCT3_BGE:
offset = ((long)rs1_val >= (long)rs2_val) ? offset_tmp : 4;
break;
- case BRANCH_BLTU:
+ case RVG_FUNCT3_BLTU:
offset = (rs1_val < rs2_val) ? offset_tmp : 4;
break;
- case BRANCH_BGEU:
+ case RVG_FUNCT3_BGEU:
offset = (rs1_val >= rs2_val) ? offset_tmp : 4;
break;
default:
diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h
index de8474146a9b..61e35db31001 100644
--- a/arch/riscv/kernel/probes/simulate-insn.h
+++ b/arch/riscv/kernel/probes/simulate-insn.h
@@ -3,14 +3,7 @@
#ifndef _RISCV_KERNEL_PROBES_SIMULATE_INSN_H
#define _RISCV_KERNEL_PROBES_SIMULATE_INSN_H
-#define __RISCV_INSN_FUNCS(name, mask, val) \
-static __always_inline bool riscv_insn_is_##name(probe_opcode_t code) \
-{ \
- BUILD_BUG_ON(~(mask) & (val)); \
- return (code & (mask)) == (val); \
-} \
-bool simulate_##name(u32 opcode, unsigned long addr, \
- struct pt_regs *regs)
+#include <asm/insn.h>
#define RISCV_INSN_REJECTED(name, code) \
do { \
@@ -19,9 +12,6 @@ bool simulate_##name(u32 opcode, unsigned long addr, \
} \
} while (0)
-__RISCV_INSN_FUNCS(system, 0x7f, 0x73);
-__RISCV_INSN_FUNCS(fence, 0x7f, 0x0f);
-
#define RISCV_INSN_SET_SIMULATE(name, code) \
do { \
if (riscv_insn_is_##name(code)) { \
@@ -30,18 +20,9 @@ __RISCV_INSN_FUNCS(fence, 0x7f, 0x0f);
} \
} while (0)
-__RISCV_INSN_FUNCS(c_j, 0xe003, 0xa001);
-__RISCV_INSN_FUNCS(c_jr, 0xf07f, 0x8002);
-__RISCV_INSN_FUNCS(c_jal, 0xe003, 0x2001);
-__RISCV_INSN_FUNCS(c_jalr, 0xf07f, 0x9002);
-__RISCV_INSN_FUNCS(c_beqz, 0xe003, 0xc001);
-__RISCV_INSN_FUNCS(c_bnez, 0xe003, 0xe001);
-__RISCV_INSN_FUNCS(c_ebreak, 0xffff, 0x9002);
-
-__RISCV_INSN_FUNCS(auipc, 0x7f, 0x17);
-__RISCV_INSN_FUNCS(branch, 0x7f, 0x63);
-
-__RISCV_INSN_FUNCS(jal, 0x7f, 0x6f);
-__RISCV_INSN_FUNCS(jalr, 0x707f, 0x67);
+bool simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs);
+bool simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs);
+bool simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs);
+bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs);
#endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 5ab1c7e1a6ed..a72879b4249a 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -12,6 +12,9 @@
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
+EXPORT_SYMBOL(strcmp);
+EXPORT_SYMBOL(strlen);
+EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(__memset);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 86acd690d529..376d2827e736 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -300,6 +300,9 @@ void __init setup_arch(char **cmdline_p)
riscv_init_cbom_blocksize();
riscv_fill_hwcap();
apply_boot_alternatives();
+ if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) &&
+ riscv_isa_extension_available(NULL, ZICBOM))
+ riscv_noncoherent_supported();
}
static int __init topology_init(void)
diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c
index 549bde5c970a..f6fda94e8e59 100644
--- a/arch/riscv/kernel/traps.c
+++ b/arch/riscv/kernel/traps.c
@@ -29,22 +29,46 @@ int show_unhandled_signals = 1;
static DEFINE_SPINLOCK(die_lock);
+static void dump_kernel_instr(const char *loglvl, struct pt_regs *regs)
+{
+ char str[sizeof("0000 ") * 12 + 2 + 1], *p = str;
+ const u16 *insns = (u16 *)instruction_pointer(regs);
+ long bad;
+ u16 val;
+ int i;
+
+ for (i = -10; i < 2; i++) {
+ bad = get_kernel_nofault(val, &insns[i]);
+ if (!bad) {
+ p += sprintf(p, i == 0 ? "(%04hx) " : "%04hx ", val);
+ } else {
+ printk("%sCode: Unable to access instruction at 0x%px.\n",
+ loglvl, &insns[i]);
+ return;
+ }
+ }
+ printk("%sCode: %s\n", loglvl, str);
+}
+
void die(struct pt_regs *regs, const char *str)
{
static int die_counter;
int ret;
long cause;
+ unsigned long flags;
oops_enter();
- spin_lock_irq(&die_lock);
+ spin_lock_irqsave(&die_lock, flags);
console_verbose();
bust_spinlocks(1);
pr_emerg("%s [#%d]\n", str, ++die_counter);
print_modules();
- if (regs)
+ if (regs) {
show_regs(regs);
+ dump_kernel_instr(KERN_EMERG, regs);
+ }
cause = regs ? regs->cause : -1;
ret = notify_die(DIE_OOPS, str, regs, 0, cause, SIGSEGV);
@@ -54,7 +78,7 @@ void die(struct pt_regs *regs, const char *str)
bust_spinlocks(0);
add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
- spin_unlock_irq(&die_lock);
+ spin_unlock_irqrestore(&die_lock, flags);
oops_exit();
if (in_interrupt())
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 5c30212d8d1c..cc2d1e8c8736 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -22,11 +22,6 @@ struct vdso_data {
};
#endif
-extern char vdso_start[], vdso_end[];
-#ifdef CONFIG_COMPAT
-extern char compat_vdso_start[], compat_vdso_end[];
-#endif
-
enum vvar_pages {
VVAR_DATA_PAGE_OFFSET,
VVAR_TIMENS_PAGE_OFFSET,
diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
index 150b1a572e61..4a0606633290 100644
--- a/arch/riscv/kernel/vdso/vdso.lds.S
+++ b/arch/riscv/kernel/vdso/vdso.lds.S
@@ -40,6 +40,13 @@ SECTIONS
. = 0x800;
.text : { *(.text .text.*) } :text
+ . = ALIGN(4);
+ .alternative : {
+ __alt_start = .;
+ *(.alternative)
+ __alt_end = .;
+ }
+
.data : {
*(.got.plt) *(.got)
*(.data .data.* .gnu.linkonce.d.*)
diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
index 643ab60e9efb..53a8ad65b255 100644
--- a/arch/riscv/kernel/vmlinux.lds.S
+++ b/arch/riscv/kernel/vmlinux.lds.S
@@ -5,6 +5,7 @@
*/
#define RO_EXCEPTION_TABLE_ALIGN 4
+#define RUNTIME_DISCARD_EXIT
#ifdef CONFIG_XIP_KERNEL
#include "vmlinux-xip.lds.S"
@@ -85,6 +86,9 @@ SECTIONS
/* Start of init data section */
__init_data_begin = .;
INIT_DATA_SECTION(16)
+ .init.bss : {
+ *(.init.bss) /* from the EFI stub */
+ }
.exit.data :
{
EXIT_DATA
@@ -95,6 +99,10 @@ SECTIONS
*(.rel.dyn*)
}
+ .rela.dyn : {
+ *(.rela*)
+ }
+
__init_data_end = .;
. = ALIGN(8);
@@ -140,6 +148,7 @@ SECTIONS
STABS_DEBUG
DWARF_DEBUG
ELF_DETAILS
+ .riscv.attributes 0 : { *(.riscv.attributes) }
DISCARDS
}
diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index f36a737d5f96..d5a658a047a7 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -20,6 +20,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)"
depends on RISCV_SBI && MMU
+ select KVM_GENERIC_HARDWARE_ENABLING
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select KVM_MMIO
diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile
index 019df9208bdd..278e97c06e0a 100644
--- a/arch/riscv/kvm/Makefile
+++ b/arch/riscv/kvm/Makefile
@@ -25,3 +25,4 @@ kvm-y += vcpu_sbi_base.o
kvm-y += vcpu_sbi_replace.o
kvm-y += vcpu_sbi_hsm.o
kvm-y += vcpu_timer.o
+kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 58c5489d3031..41ad7639a17b 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -20,16 +20,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return 0;
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- return 0;
-}
-
int kvm_arch_hardware_enable(void)
{
unsigned long hideleg, hedeleg;
@@ -49,7 +39,8 @@ int kvm_arch_hardware_enable(void)
hideleg |= (1UL << IRQ_VS_EXT);
csr_write(CSR_HIDELEG, hideleg);
- csr_write(CSR_HCOUNTEREN, -1UL);
+ /* VS should access only the time counter directly. Everything else should trap */
+ csr_write(CSR_HCOUNTEREN, 0x02);
csr_write(CSR_HVIP, 0);
@@ -70,7 +61,7 @@ void kvm_arch_hardware_disable(void)
csr_write(CSR_HIDELEG, 0);
}
-int kvm_arch_init(void *opaque)
+static int __init riscv_kvm_init(void)
{
const char *str;
@@ -115,16 +106,7 @@ int kvm_arch_init(void *opaque)
kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
- return 0;
-}
-
-void kvm_arch_exit(void)
-{
-}
-
-static int __init riscv_kvm_init(void)
-{
- return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ return kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
}
module_init(riscv_kvm_init);
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 34b57e0be2ef..78211aed36fa 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -20,12 +20,12 @@
#include <asm/pgtable.h>
#ifdef CONFIG_64BIT
-static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels = 3;
+static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels __ro_after_init = 3;
#define gstage_index_bits 9
#else
-static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
-static unsigned long gstage_pgd_levels = 2;
+static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
+static unsigned long gstage_pgd_levels __ro_after_init = 2;
#define gstage_index_bits 10
#endif
@@ -585,7 +585,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.pgd)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
&ptep, &ptep_level))
@@ -603,7 +603,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (!kvm->arch.pgd)
return false;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE);
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT,
&ptep, &ptep_level))
@@ -645,12 +645,12 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu,
if (logging || (vma->vm_flags & VM_PFNMAP))
vma_pagesize = PAGE_SIZE;
- if (vma_pagesize == PMD_SIZE || vma_pagesize == PGDIR_SIZE)
+ if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
gfn = (gpa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
mmap_read_unlock(current->mm);
- if (vma_pagesize != PGDIR_SIZE &&
+ if (vma_pagesize != PUD_SIZE &&
vma_pagesize != PMD_SIZE &&
vma_pagesize != PAGE_SIZE) {
kvm_err("Invalid VMA page size 0x%lx\n", vma_pagesize);
@@ -758,7 +758,7 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu)
kvm_riscv_local_hfence_gvma_all();
}
-void kvm_riscv_gstage_mode_detect(void)
+void __init kvm_riscv_gstage_mode_detect(void)
{
#ifdef CONFIG_64BIT
/* Try Sv57x4 G-stage mode */
@@ -782,7 +782,7 @@ skip_sv48x4_test:
#endif
}
-unsigned long kvm_riscv_gstage_mode(void)
+unsigned long __init kvm_riscv_gstage_mode(void)
{
return gstage_mode >> HGATP_MODE_SHIFT;
}
diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c
index 309d79b3e5cd..0e5479600695 100644
--- a/arch/riscv/kvm/tlb.c
+++ b/arch/riscv/kvm/tlb.c
@@ -15,8 +15,7 @@
#include <asm/hwcap.h>
#include <asm/insn-def.h>
-#define has_svinval() \
- static_branch_unlikely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_SVINVAL])
+#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid,
gpa_t gpa, gpa_t gpsz,
@@ -181,6 +180,7 @@ void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu)
void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu)
{
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD);
local_flush_icache_all();
}
@@ -264,15 +264,18 @@ void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu)
d.addr, d.size, d.order);
break;
case KVM_RISCV_HFENCE_VVMA_ASID_GVA:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
kvm_riscv_local_hfence_vvma_asid_gva(
READ_ONCE(v->vmid), d.asid,
d.addr, d.size, d.order);
break;
case KVM_RISCV_HFENCE_VVMA_ASID_ALL:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD);
kvm_riscv_local_hfence_vvma_asid_all(
READ_ONCE(v->vmid), d.asid);
break;
case KVM_RISCV_HFENCE_VVMA_GVA:
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD);
kvm_riscv_local_hfence_vvma_gva(
READ_ONCE(v->vmid),
d.addr, d.size, d.order);
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 7c08567097f0..7d010b0be54e 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -138,6 +138,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu)
WRITE_ONCE(vcpu->arch.irqs_pending, 0);
WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0);
+ kvm_riscv_vcpu_pmu_reset(vcpu);
+
vcpu->arch.hfence_head = 0;
vcpu->arch.hfence_tail = 0;
memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue));
@@ -194,6 +196,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
/* Setup VCPU timer */
kvm_riscv_vcpu_timer_init(vcpu);
+ /* setup performance monitoring */
+ kvm_riscv_vcpu_pmu_init(vcpu);
+
/* Reset VCPU */
kvm_riscv_reset_vcpu(vcpu);
@@ -216,6 +221,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
/* Cleanup VCPU timer */
kvm_riscv_vcpu_timer_deinit(vcpu);
+ kvm_riscv_vcpu_pmu_deinit(vcpu);
+
/* Free unused pages pre-allocated for G-stage page table mappings */
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
}
diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c
index c9f741ab26f5..4ea101a73d8b 100644
--- a/arch/riscv/kvm/vcpu_exit.c
+++ b/arch/riscv/kvm/vcpu_exit.c
@@ -160,6 +160,9 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu,
/* Set Guest PC to Guest exception vector */
vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC);
+
+ /* Set Guest privilege mode to supervisor */
+ vcpu->arch.guest_context.sstatus |= SR_SPP;
}
/*
@@ -179,6 +182,12 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
ret = -EFAULT;
run->exit_reason = KVM_EXIT_UNKNOWN;
switch (trap->scause) {
+ case EXC_INST_ILLEGAL:
+ if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) {
+ kvm_riscv_vcpu_trap_redirect(vcpu, trap);
+ ret = 1;
+ }
+ break;
case EXC_VIRTUAL_INST_FAULT:
if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV)
ret = kvm_riscv_vcpu_virtual_insn(vcpu, run, trap);
diff --git a/arch/riscv/kvm/vcpu_insn.c b/arch/riscv/kvm/vcpu_insn.c
index 0bb52761a3f7..f689337b78ff 100644
--- a/arch/riscv/kvm/vcpu_insn.c
+++ b/arch/riscv/kvm/vcpu_insn.c
@@ -213,7 +213,9 @@ struct csr_func {
unsigned long wr_mask);
};
-static const struct csr_func csr_funcs[] = { };
+static const struct csr_func csr_funcs[] = {
+ KVM_RISCV_VCPU_HPMCOUNTER_CSR_FUNCS
+};
/**
* kvm_riscv_vcpu_csr_return -- Handle CSR read/write after user space
diff --git a/arch/riscv/kvm/vcpu_pmu.c b/arch/riscv/kvm/vcpu_pmu.c
new file mode 100644
index 000000000000..86391a5061dd
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_pmu.c
@@ -0,0 +1,633 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Rivos Inc
+ *
+ * Authors:
+ * Atish Patra <atishp@rivosinc.com>
+ */
+
+#define pr_fmt(fmt) "riscv-kvm-pmu: " fmt
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <linux/perf/riscv_pmu.h>
+#include <asm/csr.h>
+#include <asm/kvm_vcpu_sbi.h>
+#include <asm/kvm_vcpu_pmu.h>
+#include <linux/bitops.h>
+
+#define kvm_pmu_num_counters(pmu) ((pmu)->num_hw_ctrs + (pmu)->num_fw_ctrs)
+#define get_event_type(x) (((x) & SBI_PMU_EVENT_IDX_TYPE_MASK) >> 16)
+#define get_event_code(x) ((x) & SBI_PMU_EVENT_IDX_CODE_MASK)
+
+static enum perf_hw_id hw_event_perf_map[SBI_PMU_HW_GENERAL_MAX] = {
+ [SBI_PMU_HW_CPU_CYCLES] = PERF_COUNT_HW_CPU_CYCLES,
+ [SBI_PMU_HW_INSTRUCTIONS] = PERF_COUNT_HW_INSTRUCTIONS,
+ [SBI_PMU_HW_CACHE_REFERENCES] = PERF_COUNT_HW_CACHE_REFERENCES,
+ [SBI_PMU_HW_CACHE_MISSES] = PERF_COUNT_HW_CACHE_MISSES,
+ [SBI_PMU_HW_BRANCH_INSTRUCTIONS] = PERF_COUNT_HW_BRANCH_INSTRUCTIONS,
+ [SBI_PMU_HW_BRANCH_MISSES] = PERF_COUNT_HW_BRANCH_MISSES,
+ [SBI_PMU_HW_BUS_CYCLES] = PERF_COUNT_HW_BUS_CYCLES,
+ [SBI_PMU_HW_STALLED_CYCLES_FRONTEND] = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND,
+ [SBI_PMU_HW_STALLED_CYCLES_BACKEND] = PERF_COUNT_HW_STALLED_CYCLES_BACKEND,
+ [SBI_PMU_HW_REF_CPU_CYCLES] = PERF_COUNT_HW_REF_CPU_CYCLES,
+};
+
+static u64 kvm_pmu_get_sample_period(struct kvm_pmc *pmc)
+{
+ u64 counter_val_mask = GENMASK(pmc->cinfo.width, 0);
+ u64 sample_period;
+
+ if (!pmc->counter_val)
+ sample_period = counter_val_mask + 1;
+ else
+ sample_period = (-pmc->counter_val) & counter_val_mask;
+
+ return sample_period;
+}
+
+static u32 kvm_pmu_get_perf_event_type(unsigned long eidx)
+{
+ enum sbi_pmu_event_type etype = get_event_type(eidx);
+ u32 type = PERF_TYPE_MAX;
+
+ switch (etype) {
+ case SBI_PMU_EVENT_TYPE_HW:
+ type = PERF_TYPE_HARDWARE;
+ break;
+ case SBI_PMU_EVENT_TYPE_CACHE:
+ type = PERF_TYPE_HW_CACHE;
+ break;
+ case SBI_PMU_EVENT_TYPE_RAW:
+ case SBI_PMU_EVENT_TYPE_FW:
+ type = PERF_TYPE_RAW;
+ break;
+ default:
+ break;
+ }
+
+ return type;
+}
+
+static bool kvm_pmu_is_fw_event(unsigned long eidx)
+{
+ return get_event_type(eidx) == SBI_PMU_EVENT_TYPE_FW;
+}
+
+static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ perf_event_disable(pmc->perf_event);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static u64 kvm_pmu_get_perf_event_hw_config(u32 sbi_event_code)
+{
+ return hw_event_perf_map[sbi_event_code];
+}
+
+static u64 kvm_pmu_get_perf_event_cache_config(u32 sbi_event_code)
+{
+ u64 config = U64_MAX;
+ unsigned int cache_type, cache_op, cache_result;
+
+ /* All the cache event masks lie within 0xFF. No separate masking is necessary */
+ cache_type = (sbi_event_code & SBI_PMU_EVENT_CACHE_ID_CODE_MASK) >>
+ SBI_PMU_EVENT_CACHE_ID_SHIFT;
+ cache_op = (sbi_event_code & SBI_PMU_EVENT_CACHE_OP_ID_CODE_MASK) >>
+ SBI_PMU_EVENT_CACHE_OP_SHIFT;
+ cache_result = sbi_event_code & SBI_PMU_EVENT_CACHE_RESULT_ID_CODE_MASK;
+
+ if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
+ cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+ cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+ return config;
+
+ config = cache_type | (cache_op << 8) | (cache_result << 16);
+
+ return config;
+}
+
+static u64 kvm_pmu_get_perf_event_config(unsigned long eidx, uint64_t evt_data)
+{
+ enum sbi_pmu_event_type etype = get_event_type(eidx);
+ u32 ecode = get_event_code(eidx);
+ u64 config = U64_MAX;
+
+ switch (etype) {
+ case SBI_PMU_EVENT_TYPE_HW:
+ if (ecode < SBI_PMU_HW_GENERAL_MAX)
+ config = kvm_pmu_get_perf_event_hw_config(ecode);
+ break;
+ case SBI_PMU_EVENT_TYPE_CACHE:
+ config = kvm_pmu_get_perf_event_cache_config(ecode);
+ break;
+ case SBI_PMU_EVENT_TYPE_RAW:
+ config = evt_data & RISCV_PMU_RAW_EVENT_MASK;
+ break;
+ case SBI_PMU_EVENT_TYPE_FW:
+ if (ecode < SBI_PMU_FW_MAX)
+ config = (1ULL << 63) | ecode;
+ break;
+ default:
+ break;
+ }
+
+ return config;
+}
+
+static int kvm_pmu_get_fixed_pmc_index(unsigned long eidx)
+{
+ u32 etype = kvm_pmu_get_perf_event_type(eidx);
+ u32 ecode = get_event_code(eidx);
+
+ if (etype != SBI_PMU_EVENT_TYPE_HW)
+ return -EINVAL;
+
+ if (ecode == SBI_PMU_HW_CPU_CYCLES)
+ return 0;
+ else if (ecode == SBI_PMU_HW_INSTRUCTIONS)
+ return 2;
+ else
+ return -EINVAL;
+}
+
+static int kvm_pmu_get_programmable_pmc_index(struct kvm_pmu *kvpmu, unsigned long eidx,
+ unsigned long cbase, unsigned long cmask)
+{
+ int ctr_idx = -1;
+ int i, pmc_idx;
+ int min, max;
+
+ if (kvm_pmu_is_fw_event(eidx)) {
+ /* Firmware counters are mapped 1:1 starting from num_hw_ctrs for simplicity */
+ min = kvpmu->num_hw_ctrs;
+ max = min + kvpmu->num_fw_ctrs;
+ } else {
+ /* First 3 counters are reserved for fixed counters */
+ min = 3;
+ max = kvpmu->num_hw_ctrs;
+ }
+
+ for_each_set_bit(i, &cmask, BITS_PER_LONG) {
+ pmc_idx = i + cbase;
+ if ((pmc_idx >= min && pmc_idx < max) &&
+ !test_bit(pmc_idx, kvpmu->pmc_in_use)) {
+ ctr_idx = pmc_idx;
+ break;
+ }
+ }
+
+ return ctr_idx;
+}
+
+static int pmu_get_pmc_index(struct kvm_pmu *pmu, unsigned long eidx,
+ unsigned long cbase, unsigned long cmask)
+{
+ int ret;
+
+ /* Fixed counters need to be have fixed mapping as they have different width */
+ ret = kvm_pmu_get_fixed_pmc_index(eidx);
+ if (ret >= 0)
+ return ret;
+
+ return kvm_pmu_get_programmable_pmc_index(pmu, eidx, cbase, cmask);
+}
+
+static int pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+ unsigned long *out_val)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u64 enabled, running;
+ int fevent_code;
+
+ pmc = &kvpmu->pmc[cidx];
+
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ pmc->counter_val = kvpmu->fw_event[fevent_code].value;
+ } else if (pmc->perf_event) {
+ pmc->counter_val += perf_event_read_value(pmc->perf_event, &enabled, &running);
+ } else {
+ return -EINVAL;
+ }
+ *out_val = pmc->counter_val;
+
+ return 0;
+}
+
+static int kvm_pmu_validate_counter_mask(struct kvm_pmu *kvpmu, unsigned long ctr_base,
+ unsigned long ctr_mask)
+{
+ /* Make sure the we have a valid counter mask requested from the caller */
+ if (!ctr_mask || (ctr_base + __fls(ctr_mask) >= kvm_pmu_num_counters(kvpmu)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int kvm_pmu_create_perf_event(struct kvm_pmc *pmc, struct perf_event_attr *attr,
+ unsigned long flags, unsigned long eidx, unsigned long evtdata)
+{
+ struct perf_event *event;
+
+ kvm_pmu_release_perf_event(pmc);
+ attr->config = kvm_pmu_get_perf_event_config(eidx, evtdata);
+ if (flags & SBI_PMU_CFG_FLAG_CLEAR_VALUE) {
+ //TODO: Do we really want to clear the value in hardware counter
+ pmc->counter_val = 0;
+ }
+
+ /*
+ * Set the default sample_period for now. The guest specified value
+ * will be updated in the start call.
+ */
+ attr->sample_period = kvm_pmu_get_sample_period(pmc);
+
+ event = perf_event_create_kernel_counter(attr, -1, current, NULL, pmc);
+ if (IS_ERR(event)) {
+ pr_err("kvm pmu event creation failed for eidx %lx: %ld\n", eidx, PTR_ERR(event));
+ return PTR_ERR(event);
+ }
+
+ pmc->perf_event = event;
+ if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
+ perf_event_enable(pmc->perf_event);
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_incr_fw(struct kvm_vcpu *vcpu, unsigned long fid)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_fw_event *fevent;
+
+ if (!kvpmu || fid >= SBI_PMU_FW_MAX)
+ return -EINVAL;
+
+ fevent = &kvpmu->fw_event[fid];
+ if (fevent->started)
+ fevent->value++;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_read_hpm(struct kvm_vcpu *vcpu, unsigned int csr_num,
+ unsigned long *val, unsigned long new_val,
+ unsigned long wr_mask)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int cidx, ret = KVM_INSN_CONTINUE_NEXT_SEPC;
+
+ if (!kvpmu || !kvpmu->init_done) {
+ /*
+ * In absence of sscofpmf in the platform, the guest OS may use
+ * the legacy PMU driver to read cycle/instret. In that case,
+ * just return 0 to avoid any illegal trap. However, any other
+ * hpmcounter access should result in illegal trap as they must
+ * be access through SBI PMU only.
+ */
+ if (csr_num == CSR_CYCLE || csr_num == CSR_INSTRET) {
+ *val = 0;
+ return ret;
+ } else {
+ return KVM_INSN_ILLEGAL_TRAP;
+ }
+ }
+
+ /* The counter CSR are read only. Thus, any write should result in illegal traps */
+ if (wr_mask)
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ cidx = csr_num - CSR_CYCLE;
+
+ if (pmu_ctr_read(vcpu, cidx, val) < 0)
+ return KVM_INSN_ILLEGAL_TRAP;
+
+ return ret;
+}
+
+int kvm_riscv_vcpu_pmu_num_ctrs(struct kvm_vcpu *vcpu,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ retdata->out_val = kvm_pmu_num_counters(kvpmu);
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_info(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ if (cidx > RISCV_KVM_MAX_COUNTERS || cidx == 1) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+
+ retdata->out_val = kvpmu->pmc[cidx].cinfo.value;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_start(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags, u64 ival,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int i, pmc_index, sbiret = 0;
+ struct kvm_pmc *pmc;
+ int fevent_code;
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Start the counters that have been configured and requested by the guest */
+ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+ pmc_index = i + ctr_base;
+ if (!test_bit(pmc_index, kvpmu->pmc_in_use))
+ continue;
+ pmc = &kvpmu->pmc[pmc_index];
+ if (flags & SBI_PMU_START_FLAG_SET_INIT_VALUE)
+ pmc->counter_val = ival;
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ if (fevent_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Check if the counter was already started for some reason */
+ if (kvpmu->fw_event[fevent_code].started) {
+ sbiret = SBI_ERR_ALREADY_STARTED;
+ continue;
+ }
+
+ kvpmu->fw_event[fevent_code].started = true;
+ kvpmu->fw_event[fevent_code].value = pmc->counter_val;
+ } else if (pmc->perf_event) {
+ if (unlikely(pmc->started)) {
+ sbiret = SBI_ERR_ALREADY_STARTED;
+ continue;
+ }
+ perf_event_period(pmc->perf_event, kvm_pmu_get_sample_period(pmc));
+ perf_event_enable(pmc->perf_event);
+ pmc->started = true;
+ } else {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ }
+ }
+
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_stop(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ int i, pmc_index, sbiret = 0;
+ u64 enabled, running;
+ struct kvm_pmc *pmc;
+ int fevent_code;
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ /* Stop the counters that have been configured and requested by the guest */
+ for_each_set_bit(i, &ctr_mask, RISCV_MAX_COUNTERS) {
+ pmc_index = i + ctr_base;
+ if (!test_bit(pmc_index, kvpmu->pmc_in_use))
+ continue;
+ pmc = &kvpmu->pmc[pmc_index];
+ if (pmc->cinfo.type == SBI_PMU_CTR_TYPE_FW) {
+ fevent_code = get_event_code(pmc->event_idx);
+ if (fevent_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ if (!kvpmu->fw_event[fevent_code].started)
+ sbiret = SBI_ERR_ALREADY_STOPPED;
+
+ kvpmu->fw_event[fevent_code].started = false;
+ } else if (pmc->perf_event) {
+ if (pmc->started) {
+ /* Stop counting the counter */
+ perf_event_disable(pmc->perf_event);
+ pmc->started = false;
+ } else {
+ sbiret = SBI_ERR_ALREADY_STOPPED;
+ }
+
+ if (flags & SBI_PMU_STOP_FLAG_RESET) {
+ /* Relase the counter if this is a reset request */
+ pmc->counter_val += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ kvm_pmu_release_perf_event(pmc);
+ }
+ } else {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ }
+ if (flags & SBI_PMU_STOP_FLAG_RESET) {
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ clear_bit(pmc_index, kvpmu->pmc_in_use);
+ }
+ }
+
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_cfg_match(struct kvm_vcpu *vcpu, unsigned long ctr_base,
+ unsigned long ctr_mask, unsigned long flags,
+ unsigned long eidx, u64 evtdata,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ctr_idx, ret, sbiret = 0;
+ bool is_fevent;
+ unsigned long event_code;
+ u32 etype = kvm_pmu_get_perf_event_type(eidx);
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc = NULL;
+ struct perf_event_attr attr = {
+ .type = etype,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = true,
+ /*
+ * It should never reach here if the platform doesn't support the sscofpmf
+ * extension as mode filtering won't work without it.
+ */
+ .exclude_host = true,
+ .exclude_hv = true,
+ .exclude_user = !!(flags & SBI_PMU_CFG_FLAG_SET_UINH),
+ .exclude_kernel = !!(flags & SBI_PMU_CFG_FLAG_SET_SINH),
+ .config1 = RISCV_PMU_CONFIG1_GUEST_EVENTS,
+ };
+
+ if (kvm_pmu_validate_counter_mask(kvpmu, ctr_base, ctr_mask) < 0) {
+ sbiret = SBI_ERR_INVALID_PARAM;
+ goto out;
+ }
+
+ event_code = get_event_code(eidx);
+ is_fevent = kvm_pmu_is_fw_event(eidx);
+ if (is_fevent && event_code >= SBI_PMU_FW_MAX) {
+ sbiret = SBI_ERR_NOT_SUPPORTED;
+ goto out;
+ }
+
+ /*
+ * SKIP_MATCH flag indicates the caller is aware of the assigned counter
+ * for this event. Just do a sanity check if it already marked used.
+ */
+ if (flags & SBI_PMU_CFG_FLAG_SKIP_MATCH) {
+ if (!test_bit(ctr_base + __ffs(ctr_mask), kvpmu->pmc_in_use)) {
+ sbiret = SBI_ERR_FAILURE;
+ goto out;
+ }
+ ctr_idx = ctr_base + __ffs(ctr_mask);
+ } else {
+ ctr_idx = pmu_get_pmc_index(kvpmu, eidx, ctr_base, ctr_mask);
+ if (ctr_idx < 0) {
+ sbiret = SBI_ERR_NOT_SUPPORTED;
+ goto out;
+ }
+ }
+
+ pmc = &kvpmu->pmc[ctr_idx];
+ pmc->idx = ctr_idx;
+
+ if (is_fevent) {
+ if (flags & SBI_PMU_CFG_FLAG_AUTO_START)
+ kvpmu->fw_event[event_code].started = true;
+ } else {
+ ret = kvm_pmu_create_perf_event(pmc, &attr, flags, eidx, evtdata);
+ if (ret)
+ return ret;
+ }
+
+ set_bit(ctr_idx, kvpmu->pmc_in_use);
+ pmc->event_idx = eidx;
+ retdata->out_val = ctr_idx;
+out:
+ retdata->err_val = sbiret;
+
+ return 0;
+}
+
+int kvm_riscv_vcpu_pmu_ctr_read(struct kvm_vcpu *vcpu, unsigned long cidx,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret;
+
+ ret = pmu_ctr_read(vcpu, cidx, &retdata->out_val);
+ if (ret == -EINVAL)
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+
+ return 0;
+}
+
+void kvm_riscv_vcpu_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i = 0, ret, num_hw_ctrs = 0, hpm_width = 0;
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /*
+ * PMU functionality should be only available to guests if privilege mode
+ * filtering is available in the host. Otherwise, guest will always count
+ * events while the execution is in hypervisor mode.
+ */
+ if (!riscv_isa_extension_available(NULL, SSCOFPMF))
+ return;
+
+ ret = riscv_pmu_get_hpm_info(&hpm_width, &num_hw_ctrs);
+ if (ret < 0 || !hpm_width || !num_hw_ctrs)
+ return;
+
+ /*
+ * Increase the number of hardware counters to offset the time counter.
+ */
+ kvpmu->num_hw_ctrs = num_hw_ctrs + 1;
+ kvpmu->num_fw_ctrs = SBI_PMU_FW_MAX;
+ memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+
+ if (kvpmu->num_hw_ctrs > RISCV_KVM_MAX_HW_CTRS) {
+ pr_warn_once("Limiting the hardware counters to 32 as specified by the ISA");
+ kvpmu->num_hw_ctrs = RISCV_KVM_MAX_HW_CTRS;
+ }
+
+ /*
+ * There is no correlation between the logical hardware counter and virtual counters.
+ * However, we need to encode a hpmcounter CSR in the counter info field so that
+ * KVM can trap n emulate the read. This works well in the migration use case as
+ * KVM doesn't care if the actual hpmcounter is available in the hardware or not.
+ */
+ for (i = 0; i < kvm_pmu_num_counters(kvpmu); i++) {
+ /* TIME CSR shouldn't be read from perf interface */
+ if (i == 1)
+ continue;
+ pmc = &kvpmu->pmc[i];
+ pmc->idx = i;
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ if (i < kvpmu->num_hw_ctrs) {
+ pmc->cinfo.type = SBI_PMU_CTR_TYPE_HW;
+ if (i < 3)
+ /* CY, IR counters */
+ pmc->cinfo.width = 63;
+ else
+ pmc->cinfo.width = hpm_width;
+ /*
+ * The CSR number doesn't have any relation with the logical
+ * hardware counters. The CSR numbers are encoded sequentially
+ * to avoid maintaining a map between the virtual counter
+ * and CSR number.
+ */
+ pmc->cinfo.csr = CSR_CYCLE + i;
+ } else {
+ pmc->cinfo.type = SBI_PMU_CTR_TYPE_FW;
+ pmc->cinfo.width = BITS_PER_LONG - 1;
+ }
+ }
+
+ kvpmu->init_done = true;
+}
+
+void kvm_riscv_vcpu_pmu_deinit(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ int i;
+
+ if (!kvpmu)
+ return;
+
+ for_each_set_bit(i, kvpmu->pmc_in_use, RISCV_MAX_COUNTERS) {
+ pmc = &kvpmu->pmc[i];
+ pmc->counter_val = 0;
+ kvm_pmu_release_perf_event(pmc);
+ pmc->event_idx = SBI_PMU_EVENT_IDX_INVALID;
+ }
+ bitmap_zero(kvpmu->pmc_in_use, RISCV_MAX_COUNTERS);
+ memset(&kvpmu->fw_event, 0, SBI_PMU_FW_MAX * sizeof(struct kvm_fw_event));
+}
+
+void kvm_riscv_vcpu_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_riscv_vcpu_pmu_deinit(vcpu);
+}
diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c
index f96991d230bf..15fde15f9fb8 100644
--- a/arch/riscv/kvm/vcpu_sbi.c
+++ b/arch/riscv/kvm/vcpu_sbi.c
@@ -12,26 +12,6 @@
#include <asm/sbi.h>
#include <asm/kvm_vcpu_sbi.h>
-static int kvm_linux_err_map_sbi(int err)
-{
- switch (err) {
- case 0:
- return SBI_SUCCESS;
- case -EPERM:
- return SBI_ERR_DENIED;
- case -EINVAL:
- return SBI_ERR_INVALID_PARAM;
- case -EFAULT:
- return SBI_ERR_INVALID_ADDRESS;
- case -EOPNOTSUPP:
- return SBI_ERR_NOT_SUPPORTED;
- case -EALREADY:
- return SBI_ERR_ALREADY_AVAILABLE;
- default:
- return SBI_ERR_FAILURE;
- };
-}
-
#ifndef CONFIG_RISCV_SBI_V01
static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = {
.extid_start = -1UL,
@@ -40,6 +20,16 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = {
};
#endif
+#ifdef CONFIG_RISCV_PMU_SBI
+extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu;
+#else
+static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
+ .extid_start = -1UL,
+ .extid_end = -1UL,
+ .handler = NULL,
+};
+#endif
+
static const struct kvm_vcpu_sbi_extension *sbi_ext[] = {
&vcpu_sbi_ext_v01,
&vcpu_sbi_ext_base,
@@ -48,6 +38,7 @@ static const struct kvm_vcpu_sbi_extension *sbi_ext[] = {
&vcpu_sbi_ext_rfence,
&vcpu_sbi_ext_srst,
&vcpu_sbi_ext_hsm,
+ &vcpu_sbi_ext_pmu,
&vcpu_sbi_ext_experimental,
&vcpu_sbi_ext_vendor,
};
@@ -125,11 +116,14 @@ int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
int ret = 1;
bool next_sepc = true;
- bool userspace_exit = false;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
const struct kvm_vcpu_sbi_extension *sbi_ext;
- struct kvm_cpu_trap utrap = { 0 };
- unsigned long out_val = 0;
+ struct kvm_cpu_trap utrap = {0};
+ struct kvm_vcpu_sbi_return sbi_ret = {
+ .out_val = 0,
+ .err_val = 0,
+ .utrap = &utrap,
+ };
bool ext_is_v01 = false;
sbi_ext = kvm_vcpu_sbi_find_ext(cp->a7);
@@ -139,42 +133,46 @@ int kvm_riscv_vcpu_sbi_ecall(struct kvm_vcpu *vcpu, struct kvm_run *run)
cp->a7 <= SBI_EXT_0_1_SHUTDOWN)
ext_is_v01 = true;
#endif
- ret = sbi_ext->handler(vcpu, run, &out_val, &utrap, &userspace_exit);
+ ret = sbi_ext->handler(vcpu, run, &sbi_ret);
} else {
/* Return error for unsupported SBI calls */
cp->a0 = SBI_ERR_NOT_SUPPORTED;
goto ecall_done;
}
+ /*
+ * When the SBI extension returns a Linux error code, it exits the ioctl
+ * loop and forwards the error to userspace.
+ */
+ if (ret < 0) {
+ next_sepc = false;
+ goto ecall_done;
+ }
+
/* Handle special error cases i.e trap, exit or userspace forward */
- if (utrap.scause) {
+ if (sbi_ret.utrap->scause) {
/* No need to increment sepc or exit ioctl loop */
ret = 1;
- utrap.sepc = cp->sepc;
- kvm_riscv_vcpu_trap_redirect(vcpu, &utrap);
+ sbi_ret.utrap->sepc = cp->sepc;
+ kvm_riscv_vcpu_trap_redirect(vcpu, sbi_ret.utrap);
next_sepc = false;
goto ecall_done;
}
/* Exit ioctl loop or Propagate the error code the guest */
- if (userspace_exit) {
+ if (sbi_ret.uexit) {
next_sepc = false;
ret = 0;
} else {
- /**
- * SBI extension handler always returns an Linux error code. Convert
- * it to the SBI specific error code that can be propagated the SBI
- * caller.
- */
- ret = kvm_linux_err_map_sbi(ret);
- cp->a0 = ret;
+ cp->a0 = sbi_ret.err_val;
ret = 1;
}
ecall_done:
if (next_sepc)
cp->sepc += 4;
- if (!ext_is_v01)
- cp->a1 = out_val;
+ /* a1 should only be updated when we continue the ioctl loop */
+ if (!ext_is_v01 && ret == 1)
+ cp->a1 = sbi_ret.out_val;
return ret;
}
diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c
index 5d65c634d301..9945aff34c14 100644
--- a/arch/riscv/kvm/vcpu_sbi_base.c
+++ b/arch/riscv/kvm/vcpu_sbi_base.c
@@ -14,11 +14,11 @@
#include <asm/kvm_vcpu_sbi.h>
static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *trap, bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
- int ret = 0;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ const struct kvm_vcpu_sbi_extension *sbi_ext;
+ unsigned long *out_val = &retdata->out_val;
switch (cp->a6) {
case SBI_EXT_BASE_GET_SPEC_VERSION:
@@ -42,9 +42,12 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
* forward it to the userspace
*/
kvm_riscv_vcpu_sbi_forward(vcpu, run);
- *exit = true;
- } else
- *out_val = kvm_vcpu_sbi_find_ext(cp->a0) ? 1 : 0;
+ retdata->uexit = true;
+ } else {
+ sbi_ext = kvm_vcpu_sbi_find_ext(cp->a0);
+ *out_val = sbi_ext && sbi_ext->probe ?
+ sbi_ext->probe(vcpu) : !!sbi_ext;
+ }
break;
case SBI_EXT_BASE_GET_MVENDORID:
*out_val = vcpu->arch.mvendorid;
@@ -56,11 +59,11 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
*out_val = vcpu->arch.mimpid;
break;
default:
- ret = -EOPNOTSUPP;
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
break;
}
- return ret;
+ return 0;
}
const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = {
@@ -70,17 +73,15 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base = {
};
static int kvm_sbi_ext_forward_handler(struct kvm_vcpu *vcpu,
- struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap,
- bool *exit)
+ struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
{
/*
* Both SBI experimental and vendor extensions are
* unconditionally forwarded to userspace.
*/
kvm_riscv_vcpu_sbi_forward(vcpu, run);
- *exit = true;
+ retdata->uexit = true;
return 0;
}
diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c
index 2e915cafd551..7dca0e9381d9 100644
--- a/arch/riscv/kvm/vcpu_sbi_hsm.c
+++ b/arch/riscv/kvm/vcpu_sbi_hsm.c
@@ -21,9 +21,9 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
if (!target_vcpu)
- return -EINVAL;
+ return SBI_ERR_INVALID_PARAM;
if (!target_vcpu->arch.power_off)
- return -EALREADY;
+ return SBI_ERR_ALREADY_AVAILABLE;
reset_cntx = &target_vcpu->arch.guest_reset_context;
/* start address */
@@ -42,7 +42,7 @@ static int kvm_sbi_hsm_vcpu_start(struct kvm_vcpu *vcpu)
static int kvm_sbi_hsm_vcpu_stop(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.power_off)
- return -EINVAL;
+ return SBI_ERR_FAILURE;
kvm_riscv_vcpu_power_off(vcpu);
@@ -57,7 +57,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu)
target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid);
if (!target_vcpu)
- return -EINVAL;
+ return SBI_ERR_INVALID_PARAM;
if (!target_vcpu->arch.power_off)
return SBI_HSM_STATE_STARTED;
else if (vcpu->stat.generic.blocking)
@@ -67,9 +67,7 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu)
}
static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap,
- bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
int ret = 0;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
@@ -88,27 +86,29 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
case SBI_EXT_HSM_HART_STATUS:
ret = kvm_sbi_hsm_vcpu_get_status(vcpu);
if (ret >= 0) {
- *out_val = ret;
- ret = 0;
+ retdata->out_val = ret;
+ retdata->err_val = 0;
}
- break;
+ return 0;
case SBI_EXT_HSM_HART_SUSPEND:
switch (cp->a0) {
case SBI_HSM_SUSPEND_RET_DEFAULT:
kvm_riscv_vcpu_wfi(vcpu);
break;
case SBI_HSM_SUSPEND_NON_RET_DEFAULT:
- ret = -EOPNOTSUPP;
+ ret = SBI_ERR_NOT_SUPPORTED;
break;
default:
- ret = -EINVAL;
+ ret = SBI_ERR_INVALID_PARAM;
}
break;
default:
- ret = -EOPNOTSUPP;
+ ret = SBI_ERR_NOT_SUPPORTED;
}
- return ret;
+ retdata->err_val = ret;
+
+ return 0;
}
const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm = {
diff --git a/arch/riscv/kvm/vcpu_sbi_pmu.c b/arch/riscv/kvm/vcpu_sbi_pmu.c
new file mode 100644
index 000000000000..7eca72df2cbd
--- /dev/null
+++ b/arch/riscv/kvm/vcpu_sbi_pmu.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Rivos Inc
+ *
+ * Authors:
+ * Atish Patra <atishp@rivosinc.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/kvm_host.h>
+#include <asm/csr.h>
+#include <asm/sbi.h>
+#include <asm/kvm_vcpu_sbi.h>
+
+static int kvm_sbi_ext_pmu_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
+ struct kvm_vcpu_sbi_return *retdata)
+{
+ int ret = 0;
+ struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+ unsigned long funcid = cp->a6;
+ u64 temp;
+
+ if (!kvpmu->init_done) {
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ return 0;
+ }
+
+ switch (funcid) {
+ case SBI_EXT_PMU_NUM_COUNTERS:
+ ret = kvm_riscv_vcpu_pmu_num_ctrs(vcpu, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_GET_INFO:
+ ret = kvm_riscv_vcpu_pmu_ctr_info(vcpu, cp->a0, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_CFG_MATCH:
+#if defined(CONFIG_32BIT)
+ temp = ((uint64_t)cp->a5 << 32) | cp->a4;
+#else
+ temp = cp->a4;
+#endif
+ /*
+ * This can fail if perf core framework fails to create an event.
+ * Forward the error to userspace because it's an error which
+ * happened within the host kernel. The other option would be
+ * to convert to an SBI error and forward to the guest.
+ */
+ ret = kvm_riscv_vcpu_pmu_ctr_cfg_match(vcpu, cp->a0, cp->a1,
+ cp->a2, cp->a3, temp, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_START:
+#if defined(CONFIG_32BIT)
+ temp = ((uint64_t)cp->a4 << 32) | cp->a3;
+#else
+ temp = cp->a3;
+#endif
+ ret = kvm_riscv_vcpu_pmu_ctr_start(vcpu, cp->a0, cp->a1, cp->a2,
+ temp, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_STOP:
+ ret = kvm_riscv_vcpu_pmu_ctr_stop(vcpu, cp->a0, cp->a1, cp->a2, retdata);
+ break;
+ case SBI_EXT_PMU_COUNTER_FW_READ:
+ ret = kvm_riscv_vcpu_pmu_ctr_read(vcpu, cp->a0, retdata);
+ break;
+ default:
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
+ }
+
+ return ret;
+}
+
+static unsigned long kvm_sbi_ext_pmu_probe(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *kvpmu = vcpu_to_pmu(vcpu);
+
+ return kvpmu->init_done;
+}
+
+const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = {
+ .extid_start = SBI_EXT_PMU,
+ .extid_end = SBI_EXT_PMU,
+ .handler = kvm_sbi_ext_pmu_handler,
+ .probe = kvm_sbi_ext_pmu_probe,
+};
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index 03a0198389f0..7c4d5d38a339 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -11,19 +11,21 @@
#include <linux/kvm_host.h>
#include <asm/sbi.h>
#include <asm/kvm_vcpu_timer.h>
+#include <asm/kvm_vcpu_pmu.h>
#include <asm/kvm_vcpu_sbi.h>
static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap, bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
- int ret = 0;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
u64 next_cycle;
- if (cp->a6 != SBI_EXT_TIME_SET_TIMER)
- return -EINVAL;
+ if (cp->a6 != SBI_EXT_TIME_SET_TIMER) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_SET_TIMER);
#if __riscv_xlen == 32
next_cycle = ((u64)cp->a1 << 32) | (u64)cp->a0;
#else
@@ -31,7 +33,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
#endif
kvm_riscv_vcpu_timer_next_event(vcpu, next_cycle);
- return ret;
+ return 0;
}
const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = {
@@ -41,8 +43,7 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time = {
};
static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap, bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
int ret = 0;
unsigned long i;
@@ -51,9 +52,12 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
unsigned long hmask = cp->a0;
unsigned long hbase = cp->a1;
- if (cp->a6 != SBI_EXT_IPI_SEND_IPI)
- return -EINVAL;
+ if (cp->a6 != SBI_EXT_IPI_SEND_IPI) {
+ retdata->err_val = SBI_ERR_INVALID_PARAM;
+ return 0;
+ }
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_IPI_SENT);
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
if (hbase != -1UL) {
if (tmp->vcpu_id < hbase)
@@ -64,6 +68,7 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT);
if (ret < 0)
break;
+ kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD);
}
return ret;
@@ -76,10 +81,8 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi = {
};
static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap, bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
- int ret = 0;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
unsigned long hmask = cp->a0;
unsigned long hbase = cp->a1;
@@ -88,6 +91,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
switch (funcid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
kvm_riscv_fence_i(vcpu->kvm, hbase, hmask);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
if (cp->a2 == 0 && cp->a3 == 0)
@@ -95,6 +99,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
else
kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
cp->a2, cp->a3, PAGE_SHIFT);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
if (cp->a2 == 0 && cp->a3 == 0)
@@ -105,6 +110,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
hbase, hmask,
cp->a2, cp->a3,
PAGE_SHIFT, cp->a4);
+ kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
@@ -116,10 +122,10 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
*/
break;
default:
- ret = -EOPNOTSUPP;
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
}
- return ret;
+ return 0;
}
const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = {
@@ -130,14 +136,12 @@ const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence = {
static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu,
struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap, bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
unsigned long funcid = cp->a6;
u32 reason = cp->a1;
u32 type = cp->a0;
- int ret = 0;
switch (funcid) {
case SBI_EXT_SRST_RESET:
@@ -146,24 +150,24 @@ static int kvm_sbi_ext_srst_handler(struct kvm_vcpu *vcpu,
kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
KVM_SYSTEM_EVENT_SHUTDOWN,
reason);
- *exit = true;
+ retdata->uexit = true;
break;
case SBI_SRST_RESET_TYPE_COLD_REBOOT:
case SBI_SRST_RESET_TYPE_WARM_REBOOT:
kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
KVM_SYSTEM_EVENT_RESET,
reason);
- *exit = true;
+ retdata->uexit = true;
break;
default:
- ret = -EOPNOTSUPP;
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
}
break;
default:
- ret = -EOPNOTSUPP;
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
}
- return ret;
+ return 0;
}
const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst = {
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
index 489f225ee66d..8f4c4fa16227 100644
--- a/arch/riscv/kvm/vcpu_sbi_v01.c
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -14,9 +14,7 @@
#include <asm/kvm_vcpu_sbi.h>
static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
- unsigned long *out_val,
- struct kvm_cpu_trap *utrap,
- bool *exit)
+ struct kvm_vcpu_sbi_return *retdata)
{
ulong hmask;
int i, ret = 0;
@@ -24,6 +22,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
struct kvm_vcpu *rvcpu;
struct kvm *kvm = vcpu->kvm;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
+ struct kvm_cpu_trap *utrap = retdata->utrap;
switch (cp->a7) {
case SBI_EXT_0_1_CONSOLE_GETCHAR:
@@ -33,7 +32,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
* handled in kernel so we forward these to user-space
*/
kvm_riscv_vcpu_sbi_forward(vcpu, run);
- *exit = true;
+ retdata->uexit = true;
break;
case SBI_EXT_0_1_SET_TIMER:
#if __riscv_xlen == 32
@@ -48,8 +47,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
break;
case SBI_EXT_0_1_SEND_IPI:
if (cp->a0)
- hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0,
- utrap);
+ hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap);
else
hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1;
if (utrap->scause)
@@ -65,14 +63,13 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
case SBI_EXT_0_1_SHUTDOWN:
kvm_riscv_vcpu_sbi_system_reset(vcpu, run,
KVM_SYSTEM_EVENT_SHUTDOWN, 0);
- *exit = true;
+ retdata->uexit = true;
break;
case SBI_EXT_0_1_REMOTE_FENCE_I:
case SBI_EXT_0_1_REMOTE_SFENCE_VMA:
case SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID:
if (cp->a0)
- hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0,
- utrap);
+ hmask = kvm_riscv_vcpu_unpriv_read(vcpu, false, cp->a0, utrap);
else
hmask = (1UL << atomic_read(&kvm->online_vcpus)) - 1;
if (utrap->scause)
@@ -103,7 +100,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
}
break;
default:
- ret = -EINVAL;
+ retdata->err_val = SBI_ERR_NOT_SUPPORTED;
break;
}
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index 6cd93995fb65..5246da1c9167 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -17,10 +17,10 @@
static unsigned long vmid_version = 1;
static unsigned long vmid_next;
-static unsigned long vmid_bits;
+static unsigned long vmid_bits __ro_after_init;
static DEFINE_SPINLOCK(vmid_lock);
-void kvm_riscv_gstage_vmid_detect(void)
+void __init kvm_riscv_gstage_vmid_detect(void)
{
unsigned long old;
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..6c74b0bedd60 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -3,6 +3,9 @@ lib-y += delay.o
lib-y += memcpy.o
lib-y += memset.o
lib-y += memmove.o
+lib-y += strcmp.o
+lib-y += strlen.o
+lib-y += strncmp.o
lib-$(CONFIG_MMU) += uaccess.o
lib-$(CONFIG_64BIT) += tishift.o
diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
new file mode 100644
index 000000000000..986ab23fe787
--- /dev/null
+++ b/arch/riscv/lib/strcmp.S
@@ -0,0 +1,121 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
+
+/* int strcmp(const char *cs, const char *ct) */
+SYM_FUNC_START(strcmp)
+
+ ALTERNATIVE("nop", "j strcmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * t0, t1
+ */
+1:
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 2f
+ bnez t0, 1b
+ li a0, 0
+ ret
+2:
+ /*
+ * strcmp only needs to return (< 0, 0, > 0) values
+ * not necessarily -1, 0, +1
+ */
+ sub a0, t0, t1
+ ret
+
+/*
+ * Variant of strcmp using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strcmp_zbb:
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strcmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4, t5
+ */
+
+ or t2, a0, a1
+ li t4, -1
+ and t2, t2, SZREG-1
+ bnez t2, 3f
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ REG_L t0, 0(a0)
+ REG_L t1, 0(a1)
+ orc.b t3, t0
+ bne t3, t4, 2f
+ addi a0, a0, SZREG
+ addi a1, a1, SZREG
+ beq t0, t1, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+
+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+ sltu a0, t0, t1
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ */
+ bne t0, t1, 3f
+
+ /* Otherwise, strings are equal. */
+ li a0, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+ .p2align 3
+3:
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 4f
+ bnez t0, 3b
+
+4:
+ sub a0, t0, t1
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strcmp)
diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
new file mode 100644
index 000000000000..8345ceeee3f6
--- /dev/null
+++ b/arch/riscv/lib/strlen.S
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
+
+/* int strlen(const char *s) */
+SYM_FUNC_START(strlen)
+
+ ALTERNATIVE("nop", "j strlen_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - string length
+ *
+ * Parameters
+ * a0 - String to measure
+ *
+ * Clobbers:
+ * t0, t1
+ */
+ mv t1, a0
+1:
+ lbu t0, 0(t1)
+ beqz t0, 2f
+ addi t1, t1, 1
+ j 1b
+2:
+ sub a0, t1, a0
+ ret
+
+/*
+ * Variant of strlen using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strlen_zbb:
+
+#ifdef CONFIG_CPU_BIG_ENDIAN
+# define CZ clz
+# define SHIFT sll
+#else
+# define CZ ctz
+# define SHIFT srl
+#endif
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - string length
+ *
+ * Parameters
+ * a0 - String to measure
+ *
+ * Clobbers
+ * t0, t1, t2, t3
+ */
+
+ /* Number of irrelevant bytes in the first word. */
+ andi t2, a0, SZREG-1
+
+ /* Align pointer. */
+ andi t0, a0, -SZREG
+
+ li t3, SZREG
+ sub t3, t3, t2
+ slli t2, t2, 3
+
+ /* Get the first word. */
+ REG_L t1, 0(t0)
+
+ /*
+ * Shift away the partial data we loaded to remove the irrelevant bytes
+ * preceding the string with the effect of adding NUL bytes at the
+ * end of the string's first word.
+ */
+ SHIFT t1, t1, t2
+
+ /* Convert non-NUL into 0xff and NUL into 0x00. */
+ orc.b t1, t1
+
+ /* Convert non-NUL into 0x00 and NUL into 0xff. */
+ not t1, t1
+
+ /*
+ * Search for the first set bit (corresponding to a NUL byte in the
+ * original chunk).
+ */
+ CZ t1, t1
+
+ /*
+ * The first chunk is special: compare against the number
+ * of valid bytes in this chunk.
+ */
+ srli a0, t1, 3
+ bgtu t3, a0, 3f
+
+ /* Prepare for the word comparison loop. */
+ addi t2, t0, SZREG
+ li t3, -1
+
+ /*
+ * Our critical loop is 4 instructions and processes data in
+ * 4 byte or 8 byte chunks.
+ */
+ .p2align 3
+1:
+ REG_L t1, SZREG(t0)
+ addi t0, t0, SZREG
+ orc.b t1, t1
+ beq t1, t3, 1b
+2:
+ not t1, t1
+ CZ t1, t1
+
+ /* Get number of processed words. */
+ sub t2, t0, t2
+
+ /* Add number of characters in the first word. */
+ add a0, a0, t2
+ srli t1, t1, 3
+
+ /* Add number of characters in the last word. */
+ add a0, a0, t1
+3:
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strlen)
diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
new file mode 100644
index 000000000000..ee49595075be
--- /dev/null
+++ b/arch/riscv/lib/strncmp.S
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+#include <asm/alternative-macros.h>
+#include <asm/errata_list.h>
+
+/* int strncmp(const char *cs, const char *ct, size_t count) */
+SYM_FUNC_START(strncmp)
+
+ ALTERNATIVE("nop", "j strncmp_zbb", 0, RISCV_ISA_EXT_ZBB, CONFIG_RISCV_ISA_ZBB)
+
+ /*
+ * Returns
+ * a0 - comparison result, value like strncmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ * a2 - number of characters to compare
+ *
+ * Clobbers
+ * t0, t1, t2
+ */
+ li t2, 0
+1:
+ beq a2, t2, 2f
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 3f
+ addi t2, t2, 1
+ bnez t0, 1b
+2:
+ li a0, 0
+ ret
+3:
+ /*
+ * strncmp only needs to return (< 0, 0, > 0) values
+ * not necessarily -1, 0, +1
+ */
+ sub a0, t0, t1
+ ret
+
+/*
+ * Variant of strncmp using the ZBB extension if available
+ */
+#ifdef CONFIG_RISCV_ISA_ZBB
+strncmp_zbb:
+
+.option push
+.option arch,+zbb
+
+ /*
+ * Returns
+ * a0 - comparison result, like strncmp
+ *
+ * Parameters
+ * a0 - string1
+ * a1 - string2
+ * a2 - number of characters to compare
+ *
+ * Clobbers
+ * t0, t1, t2, t3, t4, t5, t6
+ */
+
+ or t2, a0, a1
+ li t5, -1
+ and t2, t2, SZREG-1
+ add t4, a0, a2
+ bnez t2, 4f
+
+ /* Adjust limit for fast-path. */
+ andi t6, t4, -SZREG
+
+ /* Main loop for aligned string. */
+ .p2align 3
+1:
+ bgt a0, t6, 3f
+ REG_L t0, 0(a0)
+ REG_L t1, 0(a1)
+ orc.b t3, t0
+ bne t3, t5, 2f
+ addi a0, a0, SZREG
+ addi a1, a1, SZREG
+ beq t0, t1, 1b
+
+ /*
+ * Words don't match, and no null byte in the first
+ * word. Get bytes in big-endian order and compare.
+ */
+#ifndef CONFIG_CPU_BIG_ENDIAN
+ rev8 t0, t0
+ rev8 t1, t1
+#endif
+
+ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+ sltu a0, t0, t1
+ neg a0, a0
+ ori a0, a0, 1
+ ret
+
+2:
+ /*
+ * Found a null byte.
+ * If words don't match, fall back to simple loop.
+ */
+ bne t0, t1, 3f
+
+ /* Otherwise, strings are equal. */
+ li a0, 0
+ ret
+
+ /* Simple loop for misaligned strings. */
+3:
+ /* Restore limit for slow-path. */
+ .p2align 3
+4:
+ bge a0, t4, 6f
+ lbu t0, 0(a0)
+ lbu t1, 0(a1)
+ addi a0, a0, 1
+ addi a1, a1, 1
+ bne t0, t1, 5f
+ bnez t0, 4b
+
+5:
+ sub a0, t0, t1
+ ret
+
+6:
+ li a0, 0
+ ret
+
+.option pop
+#endif
+SYM_FUNC_END(strncmp)
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index d86f7cebd4a7..eb0774d9c03b 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -267,10 +267,12 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (!user_mode(regs) && addr < TASK_SIZE &&
- unlikely(!(regs->status & SR_SUM)))
- die_kernel_fault("access to user memory without uaccess routines",
- addr, regs);
+ if (!user_mode(regs) && addr < TASK_SIZE && unlikely(!(regs->status & SR_SUM))) {
+ if (fixup_exception(regs))
+ return;
+
+ die_kernel_fault("access to user memory without uaccess routines", addr, regs);
+ }
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile
index dd58e1d99397..d16bf715a586 100644
--- a/arch/riscv/purgatory/Makefile
+++ b/arch/riscv/purgatory/Makefile
@@ -2,6 +2,7 @@
OBJECT_FILES_NON_STANDARD := y
purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o
+purgatory-y += strcmp.o strlen.o strncmp.o
targets += $(purgatory-y)
PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
@@ -18,6 +19,15 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE
$(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE
$(call if_changed_rule,as_o_S)
+$(obj)/strcmp.o: $(srctree)/arch/riscv/lib/strcmp.S FORCE
+ $(call if_changed_rule,as_o_S)
+
+$(obj)/strlen.o: $(srctree)/arch/riscv/lib/strlen.S FORCE
+ $(call if_changed_rule,as_o_S)
+
+$(obj)/strncmp.o: $(srctree)/arch/riscv/lib/strncmp.S FORCE
+ $(call if_changed_rule,as_o_S)
+
$(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE
$(call if_changed_rule,cc_o_c)
@@ -77,6 +87,9 @@ CFLAGS_ctype.o += $(PURGATORY_CFLAGS)
AFLAGS_REMOVE_entry.o += -Wa,-gdwarf-2
AFLAGS_REMOVE_memcpy.o += -Wa,-gdwarf-2
AFLAGS_REMOVE_memset.o += -Wa,-gdwarf-2
+AFLAGS_REMOVE_strcmp.o += -Wa,-gdwarf-2
+AFLAGS_REMOVE_strlen.o += -Wa,-gdwarf-2
+AFLAGS_REMOVE_strncmp.o += -Wa,-gdwarf-2
$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 933771b0b07a..078cd1a773a3 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -714,7 +714,9 @@ config EADM_SCH
config VFIO_CCW
def_tristate n
prompt "Support for VFIO-CCW subchannels"
- depends on S390_CCW_IOMMU && VFIO_MDEV
+ depends on S390_CCW_IOMMU
+ depends on VFIO
+ select VFIO_MDEV
help
This driver allows usage of I/O subchannels via VFIO-CCW.
@@ -724,8 +726,10 @@ config VFIO_CCW
config VFIO_AP
def_tristate n
prompt "VFIO support for AP devices"
- depends on S390_AP_IOMMU && VFIO_MDEV && KVM
+ depends on S390_AP_IOMMU && KVM
+ depends on VFIO
depends on ZCRYPT
+ select VFIO_MDEV
help
This driver grants access to Adjunct Processor (AP) devices
via the VFIO mediated device interface.
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 74b35ec2ad28..3c68fe49042c 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -594,7 +594,6 @@ CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
CONFIG_MLX5_VFIO_PCI=m
-CONFIG_VFIO_MDEV=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index cec71268e3bc..9ab91632f74c 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -583,7 +583,6 @@ CONFIG_SYNC_FILE=y
CONFIG_VFIO=m
CONFIG_VFIO_PCI=m
CONFIG_MLX5_VFIO_PCI=m
-CONFIG_VFIO_MDEV=m
CONFIG_VIRTIO_PCI=m
CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d67ce719d16a..2bbc3d54959d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -1031,7 +1031,6 @@ extern char sie_exit;
extern int kvm_s390_gisc_register(struct kvm *kvm, u32 gisc);
extern int kvm_s390_gisc_unregister(struct kvm *kvm, u32 gisc);
-static inline void kvm_arch_hardware_disable(void) {}
static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
diff --git a/arch/s390/include/asm/msi.h b/arch/s390/include/asm/msi.h
new file mode 100644
index 000000000000..399343ed9ffb
--- /dev/null
+++ b/arch/s390/include/asm/msi.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_MSI_H
+#define _ASM_S390_MSI_H
+#include <asm-generic/msi.h>
+
+/*
+ * Work around S390 not using irq_domain at all so we can't set
+ * IRQ_DOMAIN_FLAG_ISOLATED_MSI. See for an explanation how it works:
+ *
+ * https://lore.kernel.org/r/31af8174-35e9-ebeb-b9ef-74c90d4bfd93@linux.ibm.com/
+ *
+ * Note this is less isolated than the ARM/x86 versions as userspace can trigger
+ * MSI belonging to kernel devices within the same gisa.
+ */
+#define arch_is_isolated_msi() true
+
+#endif
diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index 91e63426bdc5..7119c04c51c5 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -186,9 +186,10 @@ static inline unsigned long *get_st_pto(unsigned long entry)
/* Prototypes */
void dma_free_seg_table(unsigned long);
-unsigned long *dma_alloc_cpu_table(void);
+unsigned long *dma_alloc_cpu_table(gfp_t gfp);
void dma_cleanup_tables(unsigned long *);
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr);
+unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr,
+ gfp_t gfp);
void dma_update_cpu_trans(unsigned long *entry, phys_addr_t page_addr, int flags);
extern const struct dma_map_ops s390_pci_dma_ops;
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 9e2b95a222a9..34f9542636e9 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -22,10 +22,10 @@ KBUILD_AFLAGS += -DBUILD_VDSO
KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING
KBUILD_AFLAGS_64 := $(filter-out -m64,$(KBUILD_AFLAGS))
-KBUILD_AFLAGS_64 += -m64 -s
+KBUILD_AFLAGS_64 += -m64
KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS))
-KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin
+KBUILD_CFLAGS_64 += -m64 -fPIC -fno-common -fno-builtin
ldflags-y := -fPIC -shared -soname=linux-vdso64.so.1 \
--hash-style=both --build-id=sha1 -T
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 0243b6e38d36..3eb85f254881 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1162,6 +1162,115 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
}
/**
+ * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address.
+ * @kvm: Virtual machine instance.
+ * @gpa: Absolute guest address of the location to be changed.
+ * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a
+ * non power of two will result in failure.
+ * @old_addr: Pointer to old value. If the location at @gpa contains this value,
+ * the exchange will succeed. After calling cmpxchg_guest_abs_with_key()
+ * *@old_addr contains the value at @gpa before the attempt to
+ * exchange the value.
+ * @new: The value to place at @gpa.
+ * @access_key: The access key to use for the guest access.
+ * @success: output value indicating if an exchange occurred.
+ *
+ * Atomically exchange the value at @gpa by @new, if it contains *@old.
+ * Honors storage keys.
+ *
+ * Return: * 0: successful exchange
+ * * >0: a program interruption code indicating the reason cmpxchg could
+ * not be attempted
+ * * -EINVAL: address misaligned or len not power of two
+ * * -EAGAIN: transient failure (len 1 or 2)
+ * * -EOPNOTSUPP: read-only memslot (should never occur)
+ */
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len,
+ __uint128_t *old_addr, __uint128_t new,
+ u8 access_key, bool *success)
+{
+ gfn_t gfn = gpa_to_gfn(gpa);
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ bool writable;
+ hva_t hva;
+ int ret;
+
+ if (!IS_ALIGNED(gpa, len))
+ return -EINVAL;
+
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return PGM_ADDRESSING;
+ /*
+ * Check if it's a read-only memslot, even though that cannot occur
+ * since those are unsupported.
+ * Don't try to actually handle that case.
+ */
+ if (!writable)
+ return -EOPNOTSUPP;
+
+ hva += offset_in_page(gpa);
+ /*
+ * The cmpxchg_user_key macro depends on the type of "old", so we need
+ * a case for each valid length and get some code duplication as long
+ * as we don't introduce a new macro.
+ */
+ switch (len) {
+ case 1: {
+ u8 old;
+
+ ret = cmpxchg_user_key((u8 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 2: {
+ u16 old;
+
+ ret = cmpxchg_user_key((u16 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 4: {
+ u32 old;
+
+ ret = cmpxchg_user_key((u32 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 8: {
+ u64 old;
+
+ ret = cmpxchg_user_key((u64 __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ case 16: {
+ __uint128_t old;
+
+ ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, *old_addr, new, access_key);
+ *success = !ret && old == *old_addr;
+ *old_addr = old;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+ if (*success)
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ /*
+ * Assume that the fault is caused by protection, either key protection
+ * or user page write protection.
+ */
+ if (ret == -EFAULT)
+ ret = PGM_PROTECTION;
+ return ret;
+}
+
+/**
* guest_translate_address_with_key - translate guest logical into guest absolute address
* @vcpu: virtual cpu
* @gva: Guest virtual address
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 9408d6cc8e2c..b320d12aa049 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -206,6 +206,9 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
void *data, unsigned long len, enum gacc_mode mode);
+int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, __uint128_t *old,
+ __uint128_t new, u8 access_key, bool *success);
+
/**
* write_guest_with_key - copy data from kernel space to guest space
* @vcpu: virtual cpu
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ab26aa53ee37..9250fde1f97d 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -3103,9 +3103,9 @@ static enum hrtimer_restart gisa_vcpu_kicker(struct hrtimer *timer)
static void process_gib_alert_list(void)
{
struct kvm_s390_gisa_interrupt *gi;
+ u32 final, gisa_phys, origin = 0UL;
struct kvm_s390_gisa *gisa;
struct kvm *kvm;
- u32 final, origin = 0UL;
do {
/*
@@ -3131,9 +3131,10 @@ static void process_gib_alert_list(void)
* interruptions asap.
*/
while (origin & GISA_ADDR_MASK) {
- gisa = (struct kvm_s390_gisa *)(u64)origin;
+ gisa_phys = origin;
+ gisa = phys_to_virt(gisa_phys);
origin = gisa->next_alert;
- gisa->next_alert = (u32)(u64)gisa;
+ gisa->next_alert = gisa_phys;
kvm = container_of(gisa, struct sie_page2, gisa)->kvm;
gi = &kvm->arch.gisa_int;
if (hrtimer_active(&gi->timer))
@@ -3415,8 +3416,9 @@ void kvm_s390_gib_destroy(void)
gib = NULL;
}
-int kvm_s390_gib_init(u8 nisc)
+int __init kvm_s390_gib_init(u8 nisc)
{
+ u32 gib_origin;
int rc = 0;
if (!css_general_characteristics.aiv) {
@@ -3438,7 +3440,8 @@ int kvm_s390_gib_init(u8 nisc)
}
gib->nisc = nisc;
- if (chsc_sgib((u32)(u64)gib)) {
+ gib_origin = virt_to_phys(gib);
+ if (chsc_sgib(gib_origin)) {
pr_err("Associating the GIB with the AIV facility failed\n");
free_page((unsigned long)gib);
gib = NULL;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index e4890e04b210..39b36562c043 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -256,17 +256,6 @@ debug_info_t *kvm_s390_dbf;
debug_info_t *kvm_s390_dbf_uv;
/* Section: not file related */
-int kvm_arch_hardware_enable(void)
-{
- /* every s390 is virtualization enabled ;-) */
- return 0;
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- return 0;
-}
-
/* forward declarations */
static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end);
@@ -329,25 +318,6 @@ static struct notifier_block kvm_clock_notifier = {
.notifier_call = kvm_clock_sync,
};
-int kvm_arch_hardware_setup(void *opaque)
-{
- gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_pte_notifier(&gmap_notifier);
- vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
- gmap_register_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_register(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- gmap_unregister_pte_notifier(&gmap_notifier);
- gmap_unregister_pte_notifier(&vsie_gmap_notifier);
- atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
- &kvm_clock_notifier);
-}
-
static void allow_cpu_feat(unsigned long nr)
{
set_bit_inv(nr, kvm_s390_available_cpu_feat);
@@ -385,7 +355,7 @@ static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
#define INSN_SORTL 0xb938
#define INSN_DFLTCC 0xb939
-static void kvm_s390_cpu_feat_init(void)
+static void __init kvm_s390_cpu_feat_init(void)
{
int i;
@@ -488,7 +458,7 @@ static void kvm_s390_cpu_feat_init(void)
*/
}
-int kvm_arch_init(void *opaque)
+static int __init __kvm_s390_init(void)
{
int rc = -ENOMEM;
@@ -498,11 +468,11 @@ int kvm_arch_init(void *opaque)
kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long));
if (!kvm_s390_dbf_uv)
- goto out;
+ goto err_kvm_uv;
if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) ||
debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view))
- goto out;
+ goto err_debug_view;
kvm_s390_cpu_feat_init();
@@ -510,30 +480,49 @@ int kvm_arch_init(void *opaque)
rc = kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
if (rc) {
pr_err("A FLIC registration call failed with rc=%d\n", rc);
- goto out;
+ goto err_flic;
}
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM)) {
rc = kvm_s390_pci_init();
if (rc) {
pr_err("Unable to allocate AIFT for PCI\n");
- goto out;
+ goto err_pci;
}
}
rc = kvm_s390_gib_init(GAL_ISC);
if (rc)
- goto out;
+ goto err_gib;
+
+ gmap_notifier.notifier_call = kvm_gmap_notifier;
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_register(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
return 0;
-out:
- kvm_arch_exit();
+err_gib:
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
+ kvm_s390_pci_exit();
+err_pci:
+err_flic:
+err_debug_view:
+ debug_unregister(kvm_s390_dbf_uv);
+err_kvm_uv:
+ debug_unregister(kvm_s390_dbf);
return rc;
}
-void kvm_arch_exit(void)
+static void __kvm_s390_exit(void)
{
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
+ atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
+ &kvm_clock_notifier);
+
kvm_s390_gib_destroy();
if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV_KVM))
kvm_s390_pci_exit();
@@ -584,7 +573,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_VCPU_RESETS:
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_S390_DIAG318:
- case KVM_CAP_S390_MEM_OP_EXTENSION:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -598,6 +586,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_MEM_OP:
r = MEM_OP_MAX_SIZE;
break;
+ case KVM_CAP_S390_MEM_OP_EXTENSION:
+ /*
+ * Flag bits indicating which extensions are supported.
+ * If r > 0, the base extension must also be supported/indicated,
+ * in order to maintain backwards compatibility.
+ */
+ r = KVM_S390_MEMOP_EXTENSION_CAP_BASE |
+ KVM_S390_MEMOP_EXTENSION_CAP_CMPXCHG;
+ break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
@@ -2764,41 +2761,33 @@ static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd)
return r;
}
-static bool access_key_invalid(u8 access_key)
-{
- return access_key > 0xf;
-}
-
-static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+static int mem_op_validate_common(struct kvm_s390_mem_op *mop, u64 supported_flags)
{
- void __user *uaddr = (void __user *)mop->buf;
- u64 supported_flags;
- void *tmpbuf = NULL;
- int r, srcu_idx;
-
- supported_flags = KVM_S390_MEMOP_F_SKEY_PROTECTION
- | KVM_S390_MEMOP_F_CHECK_ONLY;
if (mop->flags & ~supported_flags || !mop->size)
return -EINVAL;
if (mop->size > MEM_OP_MAX_SIZE)
return -E2BIG;
- /*
- * This is technically a heuristic only, if the kvm->lock is not
- * taken, it is not guaranteed that the vm is/remains non-protected.
- * This is ok from a kernel perspective, wrongdoing is detected
- * on the access, -EFAULT is returned and the vm may crash the
- * next time it accesses the memory in question.
- * There is no sane usecase to do switching and a memop on two
- * different CPUs at the same time.
- */
- if (kvm_s390_pv_get_handle(kvm))
- return -EINVAL;
if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
- if (access_key_invalid(mop->key))
+ if (mop->key > 0xf)
return -EINVAL;
} else {
mop->key = 0;
}
+ return 0;
+}
+
+static int kvm_s390_vm_mem_op_abs(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
+ void *tmpbuf = NULL;
+ int r, srcu_idx;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY);
+ if (r)
+ return r;
+
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
@@ -2812,35 +2801,25 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
goto out_unlock;
}
- switch (mop->op) {
- case KVM_S390_MEMOP_ABSOLUTE_READ: {
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_FETCH, mop->key);
- } else {
- r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_FETCH, mop->key);
- if (r == 0) {
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- }
- }
- break;
+ acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key);
+ goto out_unlock;
}
- case KVM_S390_MEMOP_ABSOLUTE_WRITE: {
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gpa_range(kvm, mop->gaddr, mop->size, GACC_STORE, mop->key);
- } else {
- if (copy_from_user(tmpbuf, uaddr, mop->size)) {
- r = -EFAULT;
- break;
- }
- r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
- mop->size, GACC_STORE, mop->key);
+ if (acc_mode == GACC_FETCH) {
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_FETCH, mop->key);
+ if (r)
+ goto out_unlock;
+ if (copy_to_user(uaddr, tmpbuf, mop->size))
+ r = -EFAULT;
+ } else {
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) {
+ r = -EFAULT;
+ goto out_unlock;
}
- break;
- }
- default:
- r = -EINVAL;
+ r = access_guest_abs_with_key(kvm, mop->gaddr, tmpbuf,
+ mop->size, GACC_STORE, mop->key);
}
out_unlock:
@@ -2850,6 +2829,75 @@ out_unlock:
return r;
}
+static int kvm_s390_vm_mem_op_cmpxchg(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ void __user *uaddr = (void __user *)mop->buf;
+ void __user *old_addr = (void __user *)mop->old_addr;
+ union {
+ __uint128_t quad;
+ char raw[sizeof(__uint128_t)];
+ } old = { .quad = 0}, new = { .quad = 0 };
+ unsigned int off_in_quad = sizeof(new) - mop->size;
+ int r, srcu_idx;
+ bool success;
+
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ /*
+ * This validates off_in_quad. Checking that size is a power
+ * of two is not necessary, as cmpxchg_guest_abs_with_key
+ * takes care of that
+ */
+ if (mop->size > sizeof(new))
+ return -EINVAL;
+ if (copy_from_user(&new.raw[off_in_quad], uaddr, mop->size))
+ return -EFAULT;
+ if (copy_from_user(&old.raw[off_in_quad], old_addr, mop->size))
+ return -EFAULT;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ if (kvm_is_error_gpa(kvm, mop->gaddr)) {
+ r = PGM_ADDRESSING;
+ goto out_unlock;
+ }
+
+ r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old.quad,
+ new.quad, mop->key, &success);
+ if (!success && copy_to_user(old_addr, &old.raw[off_in_quad], mop->size))
+ r = -EFAULT;
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return r;
+}
+
+static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop)
+{
+ /*
+ * This is technically a heuristic only, if the kvm->lock is not
+ * taken, it is not guaranteed that the vm is/remains non-protected.
+ * This is ok from a kernel perspective, wrongdoing is detected
+ * on the access, -EFAULT is returned and the vm may crash the
+ * next time it accesses the memory in question.
+ * There is no sane usecase to do switching and a memop on two
+ * different CPUs at the same time.
+ */
+ if (kvm_s390_pv_get_handle(kvm))
+ return -EINVAL;
+
+ switch (mop->op) {
+ case KVM_S390_MEMOP_ABSOLUTE_READ:
+ case KVM_S390_MEMOP_ABSOLUTE_WRITE:
+ return kvm_s390_vm_mem_op_abs(kvm, mop);
+ case KVM_S390_MEMOP_ABSOLUTE_CMPXCHG:
+ return kvm_s390_vm_mem_op_cmpxchg(kvm, mop);
+ default:
+ return -EINVAL;
+ }
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5249,62 +5297,54 @@ static long kvm_s390_vcpu_mem_op(struct kvm_vcpu *vcpu,
struct kvm_s390_mem_op *mop)
{
void __user *uaddr = (void __user *)mop->buf;
+ enum gacc_mode acc_mode;
void *tmpbuf = NULL;
- int r = 0;
- const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION
- | KVM_S390_MEMOP_F_CHECK_ONLY
- | KVM_S390_MEMOP_F_SKEY_PROTECTION;
+ int r;
- if (mop->flags & ~supported_flags || mop->ar >= NUM_ACRS || !mop->size)
+ r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION |
+ KVM_S390_MEMOP_F_CHECK_ONLY |
+ KVM_S390_MEMOP_F_SKEY_PROTECTION);
+ if (r)
+ return r;
+ if (mop->ar >= NUM_ACRS)
return -EINVAL;
- if (mop->size > MEM_OP_MAX_SIZE)
- return -E2BIG;
if (kvm_s390_pv_cpu_is_protected(vcpu))
return -EINVAL;
- if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) {
- if (access_key_invalid(mop->key))
- return -EINVAL;
- } else {
- mop->key = 0;
- }
if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) {
tmpbuf = vmalloc(mop->size);
if (!tmpbuf)
return -ENOMEM;
}
- switch (mop->op) {
- case KVM_S390_MEMOP_LOGICAL_READ:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
- GACC_FETCH, mop->key);
- break;
- }
+ acc_mode = mop->op == KVM_S390_MEMOP_LOGICAL_READ ? GACC_FETCH : GACC_STORE;
+ if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
+ r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
+ acc_mode, mop->key);
+ goto out_inject;
+ }
+ if (acc_mode == GACC_FETCH) {
r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- if (r == 0) {
- if (copy_to_user(uaddr, tmpbuf, mop->size))
- r = -EFAULT;
- }
- break;
- case KVM_S390_MEMOP_LOGICAL_WRITE:
- if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
- r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size,
- GACC_STORE, mop->key);
- break;
+ if (r)
+ goto out_inject;
+ if (copy_to_user(uaddr, tmpbuf, mop->size)) {
+ r = -EFAULT;
+ goto out_free;
}
+ } else {
if (copy_from_user(tmpbuf, uaddr, mop->size)) {
r = -EFAULT;
- break;
+ goto out_free;
}
r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf,
mop->size, mop->key);
- break;
}
+out_inject:
if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0)
kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+out_free:
vfree(tmpbuf);
return r;
}
@@ -5633,23 +5673,40 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (kvm_s390_pv_get_handle(kvm))
return -EINVAL;
- if (change == KVM_MR_DELETE || change == KVM_MR_FLAGS_ONLY)
- return 0;
+ if (change != KVM_MR_DELETE && change != KVM_MR_FLAGS_ONLY) {
+ /*
+ * A few sanity checks. We can have memory slots which have to be
+ * located/ended at a segment boundary (1MB). The memory in userland is
+ * ok to be fragmented into various different vmas. It is okay to mmap()
+ * and munmap() stuff in this slot after doing this call at any time
+ */
- /* A few sanity checks. We can have memory slots which have to be
- located/ended at a segment boundary (1MB). The memory in userland is
- ok to be fragmented into various different vmas. It is okay to mmap()
- and munmap() stuff in this slot after doing this call at any time */
+ if (new->userspace_addr & 0xffffful)
+ return -EINVAL;
- if (new->userspace_addr & 0xffffful)
- return -EINVAL;
+ size = new->npages * PAGE_SIZE;
+ if (size & 0xffffful)
+ return -EINVAL;
- size = new->npages * PAGE_SIZE;
- if (size & 0xffffful)
- return -EINVAL;
+ if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
+ return -EINVAL;
+ }
- if ((new->base_gfn * PAGE_SIZE) + size > kvm->arch.mem_limit)
- return -EINVAL;
+ if (!kvm->arch.migration_mode)
+ return 0;
+
+ /*
+ * Turn off migration mode when:
+ * - userspace creates a new memslot with dirty logging off,
+ * - userspace modifies an existing memslot (MOVE or FLAGS_ONLY) and
+ * dirty logging is turned off.
+ * Migration mode expects dirty page logging being enabled to store
+ * its dirty bitmap.
+ */
+ if (change != KVM_MR_DELETE &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ WARN(kvm_s390_vm_stop_migration(kvm),
+ "Failed to stop migration mode");
return 0;
}
@@ -5696,7 +5753,7 @@ static inline unsigned long nonhyp_mask(int i)
static int __init kvm_s390_init(void)
{
- int i;
+ int i, r;
if (!sclp.has_sief2) {
pr_info("SIE is not available\n");
@@ -5712,12 +5769,23 @@ static int __init kvm_s390_init(void)
kvm_s390_fac_base[i] |=
stfle_fac_list[i] & nonhyp_mask(i);
- return kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ r = __kvm_s390_init();
+ if (r)
+ return r;
+
+ r = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
+ if (r) {
+ __kvm_s390_exit();
+ return r;
+ }
+ return 0;
}
static void __exit kvm_s390_exit(void)
{
kvm_exit();
+
+ __kvm_s390_exit();
}
module_init(kvm_s390_init);
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index d48588c207d8..0261d42c7d01 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -470,7 +470,7 @@ void kvm_s390_gisa_clear(struct kvm *kvm);
void kvm_s390_gisa_destroy(struct kvm *kvm);
void kvm_s390_gisa_disable(struct kvm *kvm);
void kvm_s390_gisa_enable(struct kvm *kvm);
-int kvm_s390_gib_init(u8 nisc);
+int __init kvm_s390_gib_init(u8 nisc);
void kvm_s390_gib_destroy(void);
/* implemented in guestdbg.c */
diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c
index ec51e810e381..b124d586db55 100644
--- a/arch/s390/kvm/pci.c
+++ b/arch/s390/kvm/pci.c
@@ -672,7 +672,7 @@ out:
return r;
}
-int kvm_s390_pci_init(void)
+int __init kvm_s390_pci_init(void)
{
zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm;
zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm;
diff --git a/arch/s390/kvm/pci.h b/arch/s390/kvm/pci.h
index 486d06ef563f..ff0972dd5e71 100644
--- a/arch/s390/kvm/pci.h
+++ b/arch/s390/kvm/pci.h
@@ -60,7 +60,7 @@ void kvm_s390_pci_clear_list(struct kvm *kvm);
int kvm_s390_pci_zpci_op(struct kvm *kvm, struct kvm_s390_zpci_op *args);
-int kvm_s390_pci_init(void);
+int __init kvm_s390_pci_init(void);
void kvm_s390_pci_exit(void);
static inline bool kvm_s390_pci_interp_allowed(void)
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index ea478d11fbd1..2d9b01d7ca4c 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -27,11 +27,11 @@ static int zpci_refresh_global(struct zpci_dev *zdev)
zdev->iommu_pages * PAGE_SIZE);
}
-unsigned long *dma_alloc_cpu_table(void)
+unsigned long *dma_alloc_cpu_table(gfp_t gfp)
{
unsigned long *table, *entry;
- table = kmem_cache_alloc(dma_region_table_cache, GFP_ATOMIC);
+ table = kmem_cache_alloc(dma_region_table_cache, gfp);
if (!table)
return NULL;
@@ -45,11 +45,11 @@ static void dma_free_cpu_table(void *table)
kmem_cache_free(dma_region_table_cache, table);
}
-static unsigned long *dma_alloc_page_table(void)
+static unsigned long *dma_alloc_page_table(gfp_t gfp)
{
unsigned long *table, *entry;
- table = kmem_cache_alloc(dma_page_table_cache, GFP_ATOMIC);
+ table = kmem_cache_alloc(dma_page_table_cache, gfp);
if (!table)
return NULL;
@@ -63,7 +63,7 @@ static void dma_free_page_table(void *table)
kmem_cache_free(dma_page_table_cache, table);
}
-static unsigned long *dma_get_seg_table_origin(unsigned long *rtep)
+static unsigned long *dma_get_seg_table_origin(unsigned long *rtep, gfp_t gfp)
{
unsigned long old_rte, rte;
unsigned long *sto;
@@ -72,7 +72,7 @@ static unsigned long *dma_get_seg_table_origin(unsigned long *rtep)
if (reg_entry_isvalid(rte)) {
sto = get_rt_sto(rte);
} else {
- sto = dma_alloc_cpu_table();
+ sto = dma_alloc_cpu_table(gfp);
if (!sto)
return NULL;
@@ -90,7 +90,7 @@ static unsigned long *dma_get_seg_table_origin(unsigned long *rtep)
return sto;
}
-static unsigned long *dma_get_page_table_origin(unsigned long *step)
+static unsigned long *dma_get_page_table_origin(unsigned long *step, gfp_t gfp)
{
unsigned long old_ste, ste;
unsigned long *pto;
@@ -99,7 +99,7 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step)
if (reg_entry_isvalid(ste)) {
pto = get_st_pto(ste);
} else {
- pto = dma_alloc_page_table();
+ pto = dma_alloc_page_table(gfp);
if (!pto)
return NULL;
set_st_pto(&ste, virt_to_phys(pto));
@@ -116,18 +116,19 @@ static unsigned long *dma_get_page_table_origin(unsigned long *step)
return pto;
}
-unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr)
+unsigned long *dma_walk_cpu_trans(unsigned long *rto, dma_addr_t dma_addr,
+ gfp_t gfp)
{
unsigned long *sto, *pto;
unsigned int rtx, sx, px;
rtx = calc_rtx(dma_addr);
- sto = dma_get_seg_table_origin(&rto[rtx]);
+ sto = dma_get_seg_table_origin(&rto[rtx], gfp);
if (!sto)
return NULL;
sx = calc_sx(dma_addr);
- pto = dma_get_page_table_origin(&sto[sx]);
+ pto = dma_get_page_table_origin(&sto[sx], gfp);
if (!pto)
return NULL;
@@ -170,7 +171,8 @@ static int __dma_update_trans(struct zpci_dev *zdev, phys_addr_t pa,
return -EINVAL;
for (i = 0; i < nr_pages; i++) {
- entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
+ entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr,
+ GFP_ATOMIC);
if (!entry) {
rc = -ENOMEM;
goto undo_cpu_trans;
@@ -186,7 +188,8 @@ undo_cpu_trans:
while (i-- > 0) {
page_addr -= PAGE_SIZE;
dma_addr -= PAGE_SIZE;
- entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr);
+ entry = dma_walk_cpu_trans(zdev->dma_table, dma_addr,
+ GFP_ATOMIC);
if (!entry)
break;
dma_update_cpu_trans(entry, page_addr, flags);
@@ -576,7 +579,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
spin_lock_init(&zdev->iommu_bitmap_lock);
- zdev->dma_table = dma_alloc_cpu_table();
+ zdev->dma_table = dma_alloc_cpu_table(GFP_KERNEL);
if (!zdev->dma_table) {
rc = -ENOMEM;
goto out;
diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile
index d237bc6841cb..32573b4f9bd2 100644
--- a/arch/s390/purgatory/Makefile
+++ b/arch/s390/purgatory/Makefile
@@ -24,7 +24,7 @@ KCSAN_SANITIZE := n
KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes
KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare
KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding
-KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common
+KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common
KBUILD_CFLAGS += -fno-stack-protector
KBUILD_CFLAGS += $(CLANG_FLAGS)
KBUILD_CFLAGS += $(call cc-option,-fno-PIE)
diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index 591125c42d49..b5e29f99c02c 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -8,13 +8,6 @@
OBJECTS := head_32.o misc.o cache.o piggy.o \
ashiftrt.o ashldi3.o ashrsi3.o ashlsi3.o lshrsi3.o
-# These were previously generated files. When you are building the kernel
-# with O=, make sure to remove the stale files in the output tree. Otherwise,
-# the build system wrongly compiles the stale ones.
-ifdef building_out_of_srctree
-$(shell rm -f $(addprefix $(obj)/, ashiftrt.S ashldi3.c ashrsi3.S ashlsi3.S lshrsi3.S))
-endif
-
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 \
vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo $(OBJECTS)
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index ad4ff3b0e91e..541a9b18e343 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -25,9 +25,12 @@ config UML
select GENERIC_IRQ_SHOW
select GENERIC_CPU_DEVICES
select HAVE_GCC_PLUGINS
+ select ARCH_SUPPORTS_LTO_CLANG
+ select ARCH_SUPPORTS_LTO_CLANG_THIN
select TRACE_IRQFLAGS_SUPPORT
select TTY # Needed for line.c
select HAVE_ARCH_VMAP_STACK
+ select HAVE_RUST if X86_64
config MMU
bool
@@ -242,4 +245,8 @@ source "arch/um/drivers/Kconfig"
config ARCH_SUSPEND_POSSIBLE
def_bool y
+menu "Power management options"
+
source "kernel/power/Kconfig"
+
+endmenu
diff --git a/arch/um/Makefile b/arch/um/Makefile
index f1d4d67157be..8186d4761bda 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -68,6 +68,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -D__arch_um__ \
-Din6addr_loopback=kernel_in6addr_loopback \
-Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr
+KBUILD_RUSTFLAGS += -Crelocation-model=pie
+
KBUILD_AFLAGS += $(ARCH_INCLUDE)
USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -I%,,$(KBUILD_CFLAGS))) \
@@ -139,11 +141,10 @@ ifeq ($(CONFIG_LD_IS_BFD),y)
LDFLAGS_EXECSTACK += $(call ld-option,--no-warn-rwx-segments)
endif
-LD_FLAGS_CMDLINE = $(foreach opt,$(KBUILD_LDFLAGS),-Wl,$(opt))
+LD_FLAGS_CMDLINE = $(foreach opt,$(KBUILD_LDFLAGS) $(LDFLAGS_EXECSTACK),-Wl,$(opt))
# Used by link-vmlinux.sh which has special support for um link
-export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE)
-export LDFLAGS_vmlinux := $(LDFLAGS_EXECSTACK)
+export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE) $(CC_FLAGS_LTO)
# When cleaning we don't include .config, so we don't include
# TT or skas makefiles and don't clean skas_ptregs.h.
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index a4f0a19fbe14..36911b1fddcf 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -261,6 +261,7 @@ config UML_NET_VECTOR
config UML_NET_VDE
bool "VDE transport (obsolete)"
depends on UML_NET
+ depends on !MODVERSIONS
select MAY_HAVE_RUNTIME_DEPS
help
This User-Mode Linux network transport allows one or more running
@@ -309,6 +310,7 @@ config UML_NET_MCAST
config UML_NET_PCAP
bool "pcap transport (obsolete)"
depends on UML_NET
+ depends on !MODVERSIONS
select MAY_HAVE_RUNTIME_DEPS
help
The pcap transport makes a pcap packet stream on the host look
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index e1dc4292bd22..dee6f66353b3 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -72,4 +72,4 @@ CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH)
CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"'
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/drivers/pcap_kern.c b/arch/um/drivers/pcap_kern.c
index cfe4cb17694c..25ee2c97ca21 100644
--- a/arch/um/drivers/pcap_kern.c
+++ b/arch/um/drivers/pcap_kern.c
@@ -15,7 +15,7 @@ struct pcap_init {
char *filter;
};
-void pcap_init(struct net_device *dev, void *data)
+void pcap_init_kern(struct net_device *dev, void *data)
{
struct uml_net_private *pri;
struct pcap_data *ppri;
@@ -44,7 +44,7 @@ static int pcap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
}
static const struct net_kern_info pcap_kern_info = {
- .init = pcap_init,
+ .init = pcap_init_kern,
.protocol = eth_protocol,
.read = pcap_read,
.write = pcap_write,
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index ded7c47d2fbe..131b7cb29576 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -767,6 +767,7 @@ static int vector_config(char *str, char **error_out)
if (parsed == NULL) {
*error_out = "vector_config failed to parse parameters";
+ kfree(params);
return -EINVAL;
}
diff --git a/arch/um/drivers/vector_user.h b/arch/um/drivers/vector_user.h
index 3a73d17a0161..59ed5f9e6e41 100644
--- a/arch/um/drivers/vector_user.h
+++ b/arch/um/drivers/vector_user.h
@@ -68,8 +68,6 @@ struct vector_fds {
};
#define VECTOR_READ 1
-#define VECTOR_WRITE (1 < 1)
-#define VECTOR_HEADERS (1 < 2)
extern struct arglist *uml_parse_vector_ifspec(char *arg);
diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index 3ac220dafec4..7699ca5f35d4 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -8,6 +8,7 @@
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/logic_iomem.h>
+#include <linux/of_platform.h>
#include <linux/irqdomain.h>
#include <linux/virtio_pcidev.h>
#include <linux/virtio-uml.h>
@@ -39,6 +40,8 @@ struct um_pci_device {
unsigned long status;
int irq;
+
+ bool platform;
};
struct um_pci_device_reg {
@@ -48,13 +51,15 @@ struct um_pci_device_reg {
static struct pci_host_bridge *bridge;
static DEFINE_MUTEX(um_pci_mtx);
+static struct um_pci_device *um_pci_platform_device;
static struct um_pci_device_reg um_pci_devices[MAX_DEVICES];
static struct fwnode_handle *um_pci_fwnode;
static struct irq_domain *um_pci_inner_domain;
static struct irq_domain *um_pci_msi_domain;
static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)];
-#define UM_VIRT_PCI_MAXDELAY 40000
+static unsigned int um_pci_max_delay_us = 40000;
+module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644);
struct um_pci_message_buffer {
struct virtio_pcidev_msg hdr;
@@ -132,8 +137,11 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
out ? 1 : 0,
posted ? cmd : HANDLE_NO_FREE(cmd),
GFP_ATOMIC);
- if (ret)
+ if (ret) {
+ if (posted)
+ kfree(cmd);
goto out;
+ }
if (posted) {
virtqueue_kick(dev->cmd_vq);
@@ -155,7 +163,7 @@ static int um_pci_send_cmd(struct um_pci_device *dev,
kfree(completed);
if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) ||
- ++delay_count > UM_VIRT_PCI_MAXDELAY,
+ ++delay_count > um_pci_max_delay_us,
"um virt-pci delay: %d", delay_count)) {
ret = -EIO;
break;
@@ -480,6 +488,9 @@ static void um_pci_handle_irq_message(struct virtqueue *vq,
struct virtio_device *vdev = vq->vdev;
struct um_pci_device *dev = vdev->priv;
+ if (!dev->irq)
+ return;
+
/* we should properly chain interrupts, but on ARCH=um we don't care */
switch (msg->op) {
@@ -533,6 +544,25 @@ static void um_pci_irq_vq_cb(struct virtqueue *vq)
}
}
+/* Copied from arch/x86/kernel/devicetree.c */
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+ struct device_node *np;
+
+ for_each_node_by_type(np, "pci") {
+ const void *prop;
+ unsigned int bus_min;
+
+ prop = of_get_property(np, "bus-range", NULL);
+ if (!prop)
+ continue;
+ bus_min = be32_to_cpup(prop);
+ if (bus->number == bus_min)
+ return np;
+ }
+ return NULL;
+}
+
static int um_pci_init_vqs(struct um_pci_device *dev)
{
struct virtqueue *vqs[2];
@@ -561,6 +591,55 @@ static int um_pci_init_vqs(struct um_pci_device *dev)
return 0;
}
+static void __um_pci_virtio_platform_remove(struct virtio_device *vdev,
+ struct um_pci_device *dev)
+{
+ virtio_reset_device(vdev);
+ vdev->config->del_vqs(vdev);
+
+ mutex_lock(&um_pci_mtx);
+ um_pci_platform_device = NULL;
+ mutex_unlock(&um_pci_mtx);
+
+ kfree(dev);
+}
+
+static int um_pci_virtio_platform_probe(struct virtio_device *vdev,
+ struct um_pci_device *dev)
+{
+ int ret;
+
+ dev->platform = true;
+
+ mutex_lock(&um_pci_mtx);
+
+ if (um_pci_platform_device) {
+ mutex_unlock(&um_pci_mtx);
+ ret = -EBUSY;
+ goto out_free;
+ }
+
+ ret = um_pci_init_vqs(dev);
+ if (ret) {
+ mutex_unlock(&um_pci_mtx);
+ goto out_free;
+ }
+
+ um_pci_platform_device = dev;
+
+ mutex_unlock(&um_pci_mtx);
+
+ ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev);
+ if (ret)
+ __um_pci_virtio_platform_remove(vdev, dev);
+
+ return ret;
+
+out_free:
+ kfree(dev);
+ return ret;
+}
+
static int um_pci_virtio_probe(struct virtio_device *vdev)
{
struct um_pci_device *dev;
@@ -574,6 +653,9 @@ static int um_pci_virtio_probe(struct virtio_device *vdev)
dev->vdev = vdev;
vdev->priv = dev;
+ if (of_device_is_compatible(vdev->dev.of_node, "simple-bus"))
+ return um_pci_virtio_platform_probe(vdev, dev);
+
mutex_lock(&um_pci_mtx);
for (i = 0; i < MAX_DEVICES; i++) {
if (um_pci_devices[i].dev)
@@ -623,9 +705,11 @@ static void um_pci_virtio_remove(struct virtio_device *vdev)
struct um_pci_device *dev = vdev->priv;
int i;
- /* Stop all virtqueues */
- virtio_reset_device(vdev);
- vdev->config->del_vqs(vdev);
+ if (dev->platform) {
+ of_platform_depopulate(&vdev->dev);
+ __um_pci_virtio_platform_remove(vdev, dev);
+ return;
+ }
device_set_wakeup_enable(&vdev->dev, false);
@@ -633,12 +717,27 @@ static void um_pci_virtio_remove(struct virtio_device *vdev)
for (i = 0; i < MAX_DEVICES; i++) {
if (um_pci_devices[i].dev != dev)
continue;
+
um_pci_devices[i].dev = NULL;
irq_free_desc(dev->irq);
+
+ break;
}
mutex_unlock(&um_pci_mtx);
- um_pci_rescan();
+ if (i < MAX_DEVICES) {
+ struct pci_dev *pci_dev;
+
+ pci_dev = pci_get_slot(bridge->bus, i);
+ if (pci_dev)
+ pci_stop_and_remove_bus_device_locked(pci_dev);
+ }
+
+ /* Stop all virtqueues */
+ virtio_reset_device(vdev);
+ dev->cmd_vq = NULL;
+ dev->irq_vq = NULL;
+ vdev->config->del_vqs(vdev);
kfree(dev);
}
@@ -860,6 +959,30 @@ void *pci_root_bus_fwnode(struct pci_bus *bus)
return um_pci_fwnode;
}
+static long um_pci_map_platform(unsigned long offset, size_t size,
+ const struct logic_iomem_ops **ops,
+ void **priv)
+{
+ if (!um_pci_platform_device)
+ return -ENOENT;
+
+ *ops = &um_pci_device_bar_ops;
+ *priv = &um_pci_platform_device->resptr[0];
+
+ return 0;
+}
+
+static const struct logic_iomem_region_ops um_pci_platform_ops = {
+ .map = um_pci_map_platform,
+};
+
+static struct resource virt_platform_resource = {
+ .name = "platform",
+ .start = 0x10000000,
+ .end = 0x1fffffff,
+ .flags = IORESOURCE_MEM,
+};
+
static int __init um_pci_init(void)
{
int err, i;
@@ -868,6 +991,8 @@ static int __init um_pci_init(void)
&um_pci_cfgspace_ops));
WARN_ON(logic_iomem_add_region(&virt_iomem_resource,
&um_pci_iomem_ops));
+ WARN_ON(logic_iomem_add_region(&virt_platform_resource,
+ &um_pci_platform_ops));
if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0,
"No virtio device ID configured for PCI - no PCI support\n"))
diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 588930a0ced1..8adca2000e51 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -168,7 +168,8 @@ static void vhost_user_check_reset(struct virtio_uml_device *vu_dev,
if (!vu_dev->registered)
return;
- virtio_break_device(&vu_dev->vdev);
+ vu_dev->registered = 0;
+
schedule_work(&pdata->conn_broken_wk);
}
@@ -412,7 +413,7 @@ static irqreturn_t vu_req_read_message(struct virtio_uml_device *vu_dev,
if (msg.msg.header.flags & VHOST_USER_FLAG_NEED_REPLY)
vhost_user_reply(vu_dev, &msg.msg, response);
irq_rc = IRQ_HANDLED;
- };
+ }
/* mask EAGAIN as we try non-blocking read until socket is empty */
vu_dev->recv_rc = (rc == -EAGAIN) ? 0 : rc;
return irq_rc;
@@ -1136,6 +1137,15 @@ void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev,
static void vu_of_conn_broken(struct work_struct *wk)
{
+ struct virtio_uml_platform_data *pdata;
+ struct virtio_uml_device *vu_dev;
+
+ pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk);
+
+ vu_dev = platform_get_drvdata(pdata->pdev);
+
+ virtio_break_device(&vu_dev->vdev);
+
/*
* We can't remove the device from the devicetree so the only thing we
* can do is warn.
@@ -1266,8 +1276,14 @@ static int vu_unregister_cmdline_device(struct device *dev, void *data)
static void vu_conn_broken(struct work_struct *wk)
{
struct virtio_uml_platform_data *pdata;
+ struct virtio_uml_device *vu_dev;
pdata = container_of(wk, struct virtio_uml_platform_data, conn_broken_wk);
+
+ vu_dev = platform_get_drvdata(pdata->pdev);
+
+ virtio_break_device(&vu_dev->vdev);
+
vu_unregister_cmdline_device(&pdata->pdev->dev, NULL);
}
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index bb5f06480da9..7414154b8e9a 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -91,7 +91,7 @@ struct cpuinfo_um {
extern struct cpuinfo_um boot_cpu_data;
-#define cpu_data (&boot_cpu_data)
+#define cpu_data(cpu) boot_cpu_data
#define current_cpu_data boot_cpu_data
#define cache_line_size() (boot_cpu_data.cache_alignment)
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 1c2d4b29a3d4..811188be954c 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o
USER_OBJS := config.o
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
targets := config.c config.tmp capflags.c
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 58938d75871a..827a0d3fa589 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -29,8 +29,8 @@ void flush_thread(void)
ret = unmap(&current->mm->context.id, 0, TASK_SIZE, 1, &data);
if (ret) {
- printk(KERN_ERR "flush_thread - clearing address space failed, "
- "err = %d\n", ret);
+ printk(KERN_ERR "%s - clearing address space failed, err = %d\n",
+ __func__, ret);
force_sig(SIGKILL);
}
get_safe_registers(current_pt_regs()->regs.gp,
diff --git a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile
index f3d494a4fd9b..f93972a25765 100644
--- a/arch/um/kernel/skas/Makefile
+++ b/arch/um/kernel/skas/Makefile
@@ -14,4 +14,4 @@ UNPROFILE_OBJS := clone.o
KCOV_INSTRUMENT := n
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index ad449173a1a1..7d050ab0f78a 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -314,8 +314,8 @@ static inline int update_p4d_range(pgd_t *pgd, unsigned long addr,
return ret;
}
-void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
- unsigned long end_addr, int force)
+static void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
+ unsigned long end_addr, int force)
{
pgd_t *pgd;
struct host_vm_change hvc;
@@ -597,6 +597,8 @@ void force_flush_all(void)
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
+ mmap_read_lock(mm);
for_each_vma(vmi, vma)
fix_range(mm, vma->vm_start, vma->vm_end, 1);
+ mmap_read_unlock(mm);
}
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 786b44dc20c9..8dcda617b8bf 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -96,7 +96,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static void *c_start(struct seq_file *m, loff_t *pos)
{
- return *pos < nr_cpu_ids ? cpu_data + *pos : NULL;
+ return *pos < nr_cpu_ids ? &boot_cpu_data + *pos : NULL;
}
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S
index 16e49bfa2b42..53d719c04ba9 100644
--- a/arch/um/kernel/vmlinux.lds.S
+++ b/arch/um/kernel/vmlinux.lds.S
@@ -1,4 +1,4 @@
-
+#define RUNTIME_DISCARD_EXIT
KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER);
#ifdef CONFIG_LD_SCRIPT_STATIC
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index 77ac50baa3f8..544e0b344c75 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -18,4 +18,4 @@ USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \
main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \
tty.o umid.o util.o
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile
index d79e75f1b69a..cf2d75bb1884 100644
--- a/arch/um/os-Linux/drivers/Makefile
+++ b/arch/um/os-Linux/drivers/Makefile
@@ -10,4 +10,4 @@ obj-y =
obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o
obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index 98ea910ef87c..cf7e49c08b21 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -127,12 +127,10 @@ int os_mod_epoll_fd(int events, int fd, void *data)
int os_del_epoll_fd(int fd)
{
struct epoll_event event;
- int result;
/* This is quiet as we use this as IO ON/OFF - so it is often
* invoked on a non-existent fd
*/
- result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
- return result;
+ return epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
}
void os_set_ioignore(void)
diff --git a/arch/um/os-Linux/skas/Makefile b/arch/um/os-Linux/skas/Makefile
index c4566e788815..75f11989d2e9 100644
--- a/arch/um/os-Linux/skas/Makefile
+++ b/arch/um/os-Linux/skas/Makefile
@@ -7,4 +7,4 @@ obj-y := mem.o process.o
USER_OBJS := $(obj-y)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 3b4975ee67e2..953fb10f3f93 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -60,8 +60,8 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
printk(UM_KERN_ERR "Registers - \n");
for (i = 0; i < MAX_REG_NR; i++)
printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
- panic("do_syscall_stub : PTRACE_SETREGS failed, errno = %d\n",
- -n);
+ panic("%s : PTRACE_SETREGS failed, errno = %d\n",
+ __func__, -n);
}
err = ptrace(PTRACE_CONT, pid, 0, 0);
@@ -81,20 +81,17 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
offset = *((unsigned long *) mm_idp->stack + 1);
if (offset) {
data = (unsigned long *)(mm_idp->stack + offset - STUB_DATA);
- printk(UM_KERN_ERR "do_syscall_stub : ret = %ld, offset = %ld, "
- "data = %p\n", ret, offset, data);
+ printk(UM_KERN_ERR "%s : ret = %ld, offset = %ld, data = %p\n",
+ __func__, ret, offset, data);
syscall = (unsigned long *)((unsigned long)data + data[0]);
- printk(UM_KERN_ERR "do_syscall_stub: syscall %ld failed, "
- "return value = 0x%lx, expected return value = 0x%lx\n",
- syscall[0], ret, syscall[7]);
- printk(UM_KERN_ERR " syscall parameters: "
- "0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
+ printk(UM_KERN_ERR "%s: syscall %ld failed, return value = 0x%lx, expected return value = 0x%lx\n",
+ __func__, syscall[0], ret, syscall[7]);
+ printk(UM_KERN_ERR " syscall parameters: 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
syscall[1], syscall[2], syscall[3],
syscall[4], syscall[5], syscall[6]);
for (n = 1; n < data[0]/sizeof(long); n++) {
if (n == 1)
- printk(UM_KERN_ERR " additional syscall "
- "data:");
+ printk(UM_KERN_ERR " additional syscall data:");
if (n % 4 == 1)
printk("\n" UM_KERN_ERR " ");
printk(" 0x%lx", data[n]);
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index b24db6017ded..b1ea53285af1 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -118,8 +118,8 @@ void wait_stub_done(int pid)
err = ptrace(PTRACE_CONT, pid, 0, 0);
if (err) {
- printk(UM_KERN_ERR "wait_stub_done : continue failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : continue failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
}
@@ -130,11 +130,10 @@ void wait_stub_done(int pid)
bad_wait:
err = ptrace_dump_regs(pid);
if (err)
- printk(UM_KERN_ERR "Failed to get registers from stub, "
- "errno = %d\n", -err);
- printk(UM_KERN_ERR "wait_stub_done : failed to wait for SIGTRAP, "
- "pid = %d, n = %d, errno = %d, status = 0x%x\n", pid, n, errno,
- status);
+ printk(UM_KERN_ERR "Failed to get registers from stub, errno = %d\n",
+ -err);
+ printk(UM_KERN_ERR "%s : failed to wait for SIGTRAP, pid = %d, n = %d, errno = %d, status = 0x%x\n",
+ __func__, pid, n, errno, status);
fatal_sigsegv();
}
@@ -195,15 +194,15 @@ static void handle_trap(int pid, struct uml_pt_regs *regs,
err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
__NR_getpid);
if (err < 0) {
- printk(UM_KERN_ERR "handle_trap - nullifying syscall "
- "failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - nullifying syscall failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
err = ptrace(PTRACE_SYSCALL, pid, 0, 0);
if (err < 0) {
- printk(UM_KERN_ERR "handle_trap - continuing to end of "
- "syscall failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - continuing to end of syscall failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
@@ -212,11 +211,10 @@ static void handle_trap(int pid, struct uml_pt_regs *regs,
(WSTOPSIG(status) != SIGTRAP + 0x80)) {
err = ptrace_dump_regs(pid);
if (err)
- printk(UM_KERN_ERR "Failed to get registers "
- "from process, errno = %d\n", -err);
- printk(UM_KERN_ERR "handle_trap - failed to wait at "
- "end of syscall, errno = %d, status = %d\n",
- errno, status);
+ printk(UM_KERN_ERR "Failed to get registers from process, errno = %d\n",
+ -err);
+ printk(UM_KERN_ERR "%s - failed to wait at end of syscall, errno = %d, status = %d\n",
+ __func__, errno, status);
fatal_sigsegv();
}
}
@@ -256,8 +254,8 @@ static int userspace_tramp(void *stack)
addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
if (addr == MAP_FAILED) {
- printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, "
- "errno = %d\n", STUB_CODE, errno);
+ printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, errno = %d\n",
+ STUB_CODE, errno);
exit(1);
}
@@ -267,8 +265,7 @@ static int userspace_tramp(void *stack)
UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_FIXED | MAP_SHARED, fd, offset);
if (addr == MAP_FAILED) {
- printk(UM_KERN_ERR "mapping segfault stack "
- "at 0x%lx failed, errno = %d\n",
+ printk(UM_KERN_ERR "mapping segfault stack at 0x%lx failed, errno = %d\n",
STUB_DATA, errno);
exit(1);
}
@@ -286,8 +283,8 @@ static int userspace_tramp(void *stack)
sa.sa_sigaction = (void *) v;
sa.sa_restorer = NULL;
if (sigaction(SIGSEGV, &sa, NULL) < 0) {
- printk(UM_KERN_ERR "userspace_tramp - setting SIGSEGV "
- "handler failed - errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - setting SIGSEGV handler failed - errno = %d\n",
+ __func__, errno);
exit(1);
}
}
@@ -322,8 +319,8 @@ int start_userspace(unsigned long stub_stack)
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (stack == MAP_FAILED) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : mmap failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : mmap failed, errno = %d\n",
+ __func__, errno);
return err;
}
@@ -336,8 +333,8 @@ int start_userspace(unsigned long stub_stack)
pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack);
if (pid < 0) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : clone failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
+ __func__, errno);
return err;
}
@@ -345,31 +342,31 @@ int start_userspace(unsigned long stub_stack)
CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
if (n < 0) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : wait failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+ __func__, errno);
goto out_kill;
}
} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
err = -EINVAL;
- printk(UM_KERN_ERR "start_userspace : expected SIGSTOP, got "
- "status = %d\n", status);
+ printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
+ __func__, status);
goto out_kill;
}
if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
(void *) PTRACE_O_TRACESYSGOOD) < 0) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : PTRACE_OLDSETOPTIONS "
- "failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, errno = %d\n",
+ __func__, errno);
goto out_kill;
}
if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
err = -errno;
- printk(UM_KERN_ERR "start_userspace : munmap failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : munmap failed, errno = %d\n",
+ __func__, errno);
goto out_kill;
}
@@ -403,14 +400,14 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs)
* just kill the process.
*/
if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
- printk(UM_KERN_ERR "userspace - ptrace set regs "
- "failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
if (put_fp_registers(pid, regs->fp)) {
- printk(UM_KERN_ERR "userspace - ptrace set fp regs "
- "failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
@@ -421,28 +418,28 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs)
singlestepping(NULL));
if (ptrace(op, pid, 0, 0)) {
- printk(UM_KERN_ERR "userspace - ptrace continue "
- "failed, op = %d, errno = %d\n", op, errno);
+ printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
+ __func__, op, errno);
fatal_sigsegv();
}
CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
if (err < 0) {
- printk(UM_KERN_ERR "userspace - wait failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
regs->is_user = 1;
if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
- printk(UM_KERN_ERR "userspace - PTRACE_GETREGS failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
if (get_fp_registers(pid, regs->fp)) {
- printk(UM_KERN_ERR "userspace - get_fp_registers failed, "
- "errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n",
+ __func__, errno);
fatal_sigsegv();
}
@@ -494,8 +491,8 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs)
unblock_signals_trace();
break;
default:
- printk(UM_KERN_ERR "userspace - child stopped "
- "with signal %d\n", sig);
+ printk(UM_KERN_ERR "%s - child stopped with signal %d\n",
+ __func__, sig);
fatal_sigsegv();
}
pid = userspace_pid[0];
@@ -555,15 +552,15 @@ int copy_context_skas0(unsigned long new_stack, int pid)
err = ptrace_setregs(pid, thread_regs);
if (err < 0) {
err = -errno;
- printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_SETREGS "
- "failed, pid = %d, errno = %d\n", pid, -err);
+ printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n",
+ __func__, pid, -err);
return err;
}
err = put_fp_registers(pid, thread_fp_regs);
if (err < 0) {
- printk(UM_KERN_ERR "copy_context_skas0 : put_fp_registers "
- "failed, pid = %d, err = %d\n", pid, err);
+ printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n",
+ __func__, pid, err);
return err;
}
@@ -574,8 +571,8 @@ int copy_context_skas0(unsigned long new_stack, int pid)
err = ptrace(PTRACE_CONT, pid, 0, 0);
if (err) {
err = -errno;
- printk(UM_KERN_ERR "Failed to continue new process, pid = %d, "
- "errno = %d\n", pid, errno);
+ printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n",
+ pid, errno);
return err;
}
@@ -583,8 +580,8 @@ int copy_context_skas0(unsigned long new_stack, int pid)
pid = data->parent_err;
if (pid < 0) {
- printk(UM_KERN_ERR "copy_context_skas0 - stub-parent reports "
- "error %d\n", -pid);
+ printk(UM_KERN_ERR "%s - stub-parent reports error %d\n",
+ __func__, -pid);
return pid;
}
@@ -594,8 +591,8 @@ int copy_context_skas0(unsigned long new_stack, int pid)
*/
wait_stub_done(pid);
if (child_data->child_err != STUB_DATA) {
- printk(UM_KERN_ERR "copy_context_skas0 - stub-child %d reports "
- "error %ld\n", pid, data->child_err);
+ printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n",
+ __func__, pid, data->child_err);
err = data->child_err;
goto out_kill;
}
@@ -603,8 +600,8 @@ int copy_context_skas0(unsigned long new_stack, int pid)
if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
(void *)PTRACE_O_TRACESYSGOOD) < 0) {
err = -errno;
- printk(UM_KERN_ERR "copy_context_skas0 : PTRACE_OLDSETOPTIONS "
- "failed, errno = %d\n", errno);
+ printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, errno = %d\n",
+ __func__, errno);
goto out_kill;
}
@@ -672,8 +669,8 @@ int start_idle_thread(void *stack, jmp_buf *switch_buf)
kmalloc_ok = 0;
return 1;
default:
- printk(UM_KERN_ERR "Bad sigsetjmp return in "
- "start_idle_thread - %d\n", n);
+ printk(UM_KERN_ERR "Bad sigsetjmp return in %s - %d\n",
+ __func__, n);
fatal_sigsegv();
}
longjmp(*switch_buf, 1);
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index b3c1ae084180..b70559b821df 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -1,6 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
core-y += arch/x86/crypto/
+#
+# Disable SSE and other FP/SIMD instructions to match normal x86
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
+
ifeq ($(CONFIG_X86_32),y)
START := 0x8048000
@@ -17,7 +23,7 @@ LDS_EXTRA := -Ui386
export LDS_EXTRA
# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include arch/x86/Makefile_32.cpu
+include $(srctree)/arch/x86/Makefile_32.cpu
# prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1acff356d97a..6b6cfe607bdb 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -50,7 +50,7 @@ KBUILD_CFLAGS += $(call cc-option,-fmacro-prefix-map=$(srctree)/=)
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
KBUILD_CFLAGS += -D__DISABLE_EXPORTS
# Disable relocation relaxation in case the link is not PIE.
-KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
+KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
# sev.c indirectly inludes inat-table.h which is generated during
diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S
index ad0d51f03cb4..6a255e6809bc 100644
--- a/arch/x86/coco/tdx/tdcall.S
+++ b/arch/x86/coco/tdx/tdcall.S
@@ -13,6 +13,12 @@
/*
* Bitmasks of exposed registers (with VMM).
*/
+#define TDX_RDX BIT(2)
+#define TDX_RBX BIT(3)
+#define TDX_RSI BIT(6)
+#define TDX_RDI BIT(7)
+#define TDX_R8 BIT(8)
+#define TDX_R9 BIT(9)
#define TDX_R10 BIT(10)
#define TDX_R11 BIT(11)
#define TDX_R12 BIT(12)
@@ -27,9 +33,9 @@
* details can be found in TDX GHCI specification, section
* titled "TDCALL [TDG.VP.VMCALL] leaf".
*/
-#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \
- TDX_R12 | TDX_R13 | \
- TDX_R14 | TDX_R15 )
+#define TDVMCALL_EXPOSE_REGS_MASK \
+ ( TDX_RDX | TDX_RBX | TDX_RSI | TDX_RDI | TDX_R8 | TDX_R9 | \
+ TDX_R10 | TDX_R11 | TDX_R12 | TDX_R13 | TDX_R14 | TDX_R15 )
.section .noinstr.text, "ax"
@@ -126,25 +132,38 @@ SYM_FUNC_START(__tdx_hypercall)
push %r14
push %r13
push %r12
+ push %rbx
+
+ /* Free RDI and RSI to be used as TDVMCALL arguments */
+ movq %rdi, %rax
+ push %rsi
+
+ /* Copy hypercall registers from arg struct: */
+ movq TDX_HYPERCALL_r8(%rax), %r8
+ movq TDX_HYPERCALL_r9(%rax), %r9
+ movq TDX_HYPERCALL_r10(%rax), %r10
+ movq TDX_HYPERCALL_r11(%rax), %r11
+ movq TDX_HYPERCALL_r12(%rax), %r12
+ movq TDX_HYPERCALL_r13(%rax), %r13
+ movq TDX_HYPERCALL_r14(%rax), %r14
+ movq TDX_HYPERCALL_r15(%rax), %r15
+ movq TDX_HYPERCALL_rdi(%rax), %rdi
+ movq TDX_HYPERCALL_rsi(%rax), %rsi
+ movq TDX_HYPERCALL_rbx(%rax), %rbx
+ movq TDX_HYPERCALL_rdx(%rax), %rdx
+
+ push %rax
/* Mangle function call ABI into TDCALL ABI: */
/* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */
xor %eax, %eax
- /* Copy hypercall registers from arg struct: */
- movq TDX_HYPERCALL_r10(%rdi), %r10
- movq TDX_HYPERCALL_r11(%rdi), %r11
- movq TDX_HYPERCALL_r12(%rdi), %r12
- movq TDX_HYPERCALL_r13(%rdi), %r13
- movq TDX_HYPERCALL_r14(%rdi), %r14
- movq TDX_HYPERCALL_r15(%rdi), %r15
-
movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx
tdcall
/*
- * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that
+ * RAX!=0 indicates a failure of the TDVMCALL mechanism itself and that
* something has gone horribly wrong with the TDX module.
*
* The return status of the hypercall operation is in a separate
@@ -154,30 +173,46 @@ SYM_FUNC_START(__tdx_hypercall)
testq %rax, %rax
jne .Lpanic
- /* TDVMCALL leaf return code is in R10 */
- movq %r10, %rax
+ pop %rax
/* Copy hypercall result registers to arg struct if needed */
- testq $TDX_HCALL_HAS_OUTPUT, %rsi
+ testq $TDX_HCALL_HAS_OUTPUT, (%rsp)
jz .Lout
- movq %r10, TDX_HYPERCALL_r10(%rdi)
- movq %r11, TDX_HYPERCALL_r11(%rdi)
- movq %r12, TDX_HYPERCALL_r12(%rdi)
- movq %r13, TDX_HYPERCALL_r13(%rdi)
- movq %r14, TDX_HYPERCALL_r14(%rdi)
- movq %r15, TDX_HYPERCALL_r15(%rdi)
+ movq %r8, TDX_HYPERCALL_r8(%rax)
+ movq %r9, TDX_HYPERCALL_r9(%rax)
+ movq %r10, TDX_HYPERCALL_r10(%rax)
+ movq %r11, TDX_HYPERCALL_r11(%rax)
+ movq %r12, TDX_HYPERCALL_r12(%rax)
+ movq %r13, TDX_HYPERCALL_r13(%rax)
+ movq %r14, TDX_HYPERCALL_r14(%rax)
+ movq %r15, TDX_HYPERCALL_r15(%rax)
+ movq %rdi, TDX_HYPERCALL_rdi(%rax)
+ movq %rsi, TDX_HYPERCALL_rsi(%rax)
+ movq %rbx, TDX_HYPERCALL_rbx(%rax)
+ movq %rdx, TDX_HYPERCALL_rdx(%rax)
.Lout:
+ /* TDVMCALL leaf return code is in R10 */
+ movq %r10, %rax
+
/*
* Zero out registers exposed to the VMM to avoid speculative execution
* with VMM-controlled values. This needs to include all registers
- * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15
- * context will be restored.
+ * present in TDVMCALL_EXPOSE_REGS_MASK, except RBX, and R12-R15 which
+ * will be restored.
*/
+ xor %r8d, %r8d
+ xor %r9d, %r9d
xor %r10d, %r10d
xor %r11d, %r11d
+ xor %rdi, %rdi
+ xor %rdx, %rdx
+
+ /* Remove TDX_HCALL_* flags from the stack */
+ pop %rsi
/* Restore callee-saved GPRs as mandated by the x86_64 ABI */
+ pop %rbx
pop %r12
pop %r13
pop %r14
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 3bd111d5e6a0..055300e08fb3 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -19,9 +19,14 @@
#define TDX_GET_VEINFO 3
#define TDX_GET_REPORT 4
#define TDX_ACCEPT_PAGE 6
+#define TDX_WR 8
+
+/* TDCS fields. To be used by TDG.VM.WR and TDG.VM.RD module calls */
+#define TDCS_NOTIFY_ENABLES 0x9100000000000010
/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001
+#define TDVMCALL_REPORT_FATAL_ERROR 0x10003
/* MMIO direction */
#define EPT_READ 0
@@ -37,6 +42,7 @@
#define VE_GET_PORT_NUM(e) ((e) >> 16)
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
+#define ATTR_DEBUG BIT(0)
#define ATTR_SEPT_VE_DISABLE BIT(28)
/* TDX Module call error codes */
@@ -141,6 +147,41 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport)
}
EXPORT_SYMBOL_GPL(tdx_mcall_get_report0);
+static void __noreturn tdx_panic(const char *msg)
+{
+ struct tdx_hypercall_args args = {
+ .r10 = TDX_HYPERCALL_STANDARD,
+ .r11 = TDVMCALL_REPORT_FATAL_ERROR,
+ .r12 = 0, /* Error code: 0 is Panic */
+ };
+ union {
+ /* Define register order according to the GHCI */
+ struct { u64 r14, r15, rbx, rdi, rsi, r8, r9, rdx; };
+
+ char str[64];
+ } message;
+
+ /* VMM assumes '\0' in byte 65, if the message took all 64 bytes */
+ strncpy(message.str, msg, 64);
+
+ args.r8 = message.r8;
+ args.r9 = message.r9;
+ args.r14 = message.r14;
+ args.r15 = message.r15;
+ args.rdi = message.rdi;
+ args.rsi = message.rsi;
+ args.rbx = message.rbx;
+ args.rdx = message.rdx;
+
+ /*
+ * This hypercall should never return and it is not safe
+ * to keep the guest running. Call it forever if it
+ * happens to return.
+ */
+ while (1)
+ __tdx_hypercall(&args, 0);
+}
+
static void tdx_parse_tdinfo(u64 *cc_mask)
{
struct tdx_module_output out;
@@ -172,8 +213,15 @@ static void tdx_parse_tdinfo(u64 *cc_mask)
* TD-private memory. Only VMM-shared memory (MMIO) will #VE.
*/
td_attr = out.rdx;
- if (!(td_attr & ATTR_SEPT_VE_DISABLE))
- panic("TD misconfiguration: SEPT_VE_DISABLE attibute must be set.\n");
+ if (!(td_attr & ATTR_SEPT_VE_DISABLE)) {
+ const char *msg = "TD misconfiguration: SEPT_VE_DISABLE attribute must be set.";
+
+ /* Relax SEPT_VE_DISABLE check for debug TD. */
+ if (td_attr & ATTR_DEBUG)
+ pr_warn("%s\n", msg);
+ else
+ tdx_panic(msg);
+ }
}
/*
@@ -617,6 +665,11 @@ static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
}
}
+static inline bool is_private_gpa(u64 gpa)
+{
+ return gpa == cc_mkenc(gpa);
+}
+
/*
* Handle the kernel #VE.
*
@@ -635,6 +688,8 @@ static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
case EXIT_REASON_CPUID:
return handle_cpuid(regs, ve);
case EXIT_REASON_EPT_VIOLATION:
+ if (is_private_gpa(ve->gpa))
+ panic("Unexpected EPT-violation on private memory.");
return handle_mmio(regs, ve);
case EXIT_REASON_IO_INSTRUCTION:
return handle_io(regs, ve);
@@ -801,6 +856,9 @@ void __init tdx_early_init(void)
tdx_parse_tdinfo(&cc_mask);
cc_set_mask(cc_mask);
+ /* Kernel does not use NOTIFY_ENABLES and does not need random #VEs */
+ tdx_module_call(TDX_WR, 0, TDCS_NOTIFY_ENABLES, 0, -1ULL, NULL);
+
/*
* All bits above GPA width are reserved and kernel treats shared bit
* as flag, not as part of physical address.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 9fce2d1247a7..a3fb996a86a1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -6495,6 +6495,7 @@ __init int intel_pmu_init(void)
x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints;
x86_pmu.extra_regs = intel_spr_extra_regs;
x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.pebs_ept = 1;
x86_pmu.pebs_aliases = NULL;
x86_pmu.pebs_prec_dist = true;
x86_pmu.pebs_block = true;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index b0354dc869d2..a2e566e53076 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2366,8 +2366,10 @@ void __init intel_ds_init(void)
x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
break;
- case 4:
case 5:
+ x86_pmu.pebs_ept = 1;
+ fallthrough;
+ case 4:
x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl;
x86_pmu.pebs_record_size = sizeof(struct pebs_basic);
if (x86_pmu.intel_cap.pebs_baseline) {
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 389ea336258f..73c9672c123b 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -315,6 +315,9 @@
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
+#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
+#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
+#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 566ac26239ba..0b73a809e9e1 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -269,6 +269,9 @@ enum hv_isolation_type {
/* TSC invariant control */
#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118
+/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
+#define HV_EXPOSE_INVARIANT_TSC BIT_ULL(0)
+
/* Register name aliases for temporary compatibility */
#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 72184b0b2219..b241af4ce9b4 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -582,18 +582,14 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_MC, xenpv_exc_machine_check);
/* NMI */
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
- * Special NOIST entry point for VMX which invokes this on the kernel
- * stack. asm_exc_nmi() requires an IST to work correctly vs. the NMI
- * 'executing' marker.
- *
- * On 32bit this just uses the regular NMI entry point because 32-bit does
- * not have ISTs.
+ * Special entry point for VMX which invokes this on the kernel stack, even for
+ * 64-bit, i.e. without using an IST. asm_exc_nmi() requires an IST to work
+ * correctly vs. the NMI 'executing' marker. Used for 32-bit kernels as well
+ * to avoid more ifdeffery.
*/
-DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_noist);
-#else
-#define asm_exc_nmi_noist asm_exc_nmi
+DECLARE_IDTENTRY(X86_TRAP_NMI, exc_nmi_kvm_vmx);
#endif
DECLARE_IDTENTRY_NMI(X86_TRAP_NMI, exc_nmi);
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index abccd51dcfca..8dc345cc6318 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,6 +14,7 @@ BUILD_BUG_ON(1)
* to make a definition optional, but in this case the default will
* be __static_call_return0.
*/
+KVM_X86_OP(check_processor_compatibility)
KVM_X86_OP(hardware_enable)
KVM_X86_OP(hardware_disable)
KVM_X86_OP(hardware_unsetup)
@@ -76,7 +77,6 @@ KVM_X86_OP(set_nmi_mask)
KVM_X86_OP(enable_nmi_window)
KVM_X86_OP(enable_irq_window)
KVM_X86_OP_OPTIONAL(update_cr8_intercept)
-KVM_X86_OP(check_apicv_inhibit_reasons)
KVM_X86_OP(refresh_apicv_exec_ctrl)
KVM_X86_OP_OPTIONAL(hwapic_irr_update)
KVM_X86_OP_OPTIONAL(hwapic_isr_update)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6aaae18f1854..808c292ad3f4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -134,8 +134,6 @@
#define INVALID_PAGE (~(hpa_t)0)
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-#define INVALID_GPA (~(gpa_t)0)
-
/* KVM Hugepage definitions for x86 */
#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G
#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1)
@@ -514,6 +512,7 @@ struct kvm_pmc {
#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1)
#define KVM_PMC_MAX_FIXED 3
+#define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1)
#define KVM_AMD_PMC_MAX_GENERIC 6
struct kvm_pmu {
unsigned nr_arch_gp_counters;
@@ -678,6 +677,11 @@ struct kvm_vcpu_hv {
} nested;
};
+struct kvm_hypervisor_cpuid {
+ u32 base;
+ u32 limit;
+};
+
/* Xen HVM per vcpu emulation context */
struct kvm_vcpu_xen {
u64 hypercall_rip;
@@ -698,6 +702,7 @@ struct kvm_vcpu_xen {
struct hrtimer timer;
int poll_evtchn;
struct timer_list poll_timer;
+ struct kvm_hypervisor_cpuid cpuid;
};
struct kvm_queued_exception {
@@ -826,7 +831,7 @@ struct kvm_vcpu_arch {
int cpuid_nent;
struct kvm_cpuid_entry2 *cpuid_entries;
- u32 kvm_cpuid_base;
+ struct kvm_hypervisor_cpuid kvm_cpuid;
u64 reserved_gpa_bits;
int maxphyaddr;
@@ -1022,19 +1027,30 @@ struct kvm_arch_memory_slot {
};
/*
- * We use as the mode the number of bits allocated in the LDR for the
- * logical processor ID. It happens that these are all powers of two.
- * This makes it is very easy to detect cases where the APICs are
- * configured for multiple modes; in that case, we cannot use the map and
- * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ * Track the mode of the optimized logical map, as the rules for decoding the
+ * destination vary per mode. Enabling the optimized logical map requires all
+ * software-enabled local APIs to be in the same mode, each addressable APIC to
+ * be mapped to only one MDA, and each MDA to map to at most one APIC.
*/
-#define KVM_APIC_MODE_XAPIC_CLUSTER 4
-#define KVM_APIC_MODE_XAPIC_FLAT 8
-#define KVM_APIC_MODE_X2APIC 16
+enum kvm_apic_logical_mode {
+ /* All local APICs are software disabled. */
+ KVM_APIC_MODE_SW_DISABLED,
+ /* All software enabled local APICs in xAPIC cluster addressing mode. */
+ KVM_APIC_MODE_XAPIC_CLUSTER,
+ /* All software enabled local APICs in xAPIC flat addressing mode. */
+ KVM_APIC_MODE_XAPIC_FLAT,
+ /* All software enabled local APICs in x2APIC mode. */
+ KVM_APIC_MODE_X2APIC,
+ /*
+ * Optimized map disabled, e.g. not all local APICs in the same logical
+ * mode, same logical ID assigned to multiple APICs, etc.
+ */
+ KVM_APIC_MODE_MAP_DISABLED,
+};
struct kvm_apic_map {
struct rcu_head rcu;
- u8 mode;
+ enum kvm_apic_logical_mode logical_mode;
u32 max_apic_id;
union {
struct kvm_lapic *xapic_flat_map[8];
@@ -1088,6 +1104,7 @@ struct kvm_hv {
u64 hv_reenlightenment_control;
u64 hv_tsc_emulation_control;
u64 hv_tsc_emulation_status;
+ u64 hv_invtsc_control;
/* How many vCPUs have VP index != vCPU index */
atomic_t num_mismatched_vp_indexes;
@@ -1133,6 +1150,18 @@ struct kvm_x86_msr_filter {
struct msr_bitmap_range ranges[16];
};
+struct kvm_x86_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 nr_includes;
+ __u32 nr_excludes;
+ __u64 *includes;
+ __u64 *excludes;
+ __u64 events[];
+};
+
enum kvm_apicv_inhibit {
/********************************************************************/
@@ -1164,6 +1193,12 @@ enum kvm_apicv_inhibit {
APICV_INHIBIT_REASON_BLOCKIRQ,
/*
+ * APICv is disabled because not all vCPUs have a 1:1 mapping between
+ * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack.
+ */
+ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED,
+
+ /*
* For simplicity, the APIC acceleration is inhibited
* first time either APIC ID or APIC base are changed by the guest
* from their reset values.
@@ -1201,6 +1236,12 @@ enum kvm_apicv_inhibit {
* AVIC is disabled because SEV doesn't support it.
*/
APICV_INHIBIT_REASON_SEV,
+
+ /*
+ * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1
+ * mapping between logical ID and vCPU.
+ */
+ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED,
};
struct kvm_arch {
@@ -1249,10 +1290,11 @@ struct kvm_arch {
struct kvm_apic_map __rcu *apic_map;
atomic_t apic_map_dirty;
- /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */
- struct rw_semaphore apicv_update_lock;
-
bool apic_access_memslot_enabled;
+ bool apic_access_memslot_inhibited;
+
+ /* Protects apicv_inhibit_reasons */
+ struct rw_semaphore apicv_update_lock;
unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1302,7 +1344,6 @@ struct kvm_arch {
u32 bsp_vcpu_id;
u64 disabled_quirks;
- int cpu_dirty_logging_count;
enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
@@ -1338,25 +1379,16 @@ struct kvm_arch {
/* Guest can access the SGX PROVISIONKEY. */
bool sgx_provisioning_allowed;
- struct kvm_pmu_event_filter __rcu *pmu_event_filter;
+ struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter;
struct task_struct *nx_huge_page_recovery_thread;
#ifdef CONFIG_X86_64
- /*
- * Whether the TDP MMU is enabled for this VM. This contains a
- * snapshot of the TDP MMU module parameter from when the VM was
- * created and remains unchanged for the life of the VM. If this is
- * true, TDP MMU handler functions will run for various MMU
- * operations.
- */
- bool tdp_mmu_enabled;
-
/* The number of TDP MMU pages across all roots. */
atomic64_t tdp_mmu_pages;
/*
- * List of kvm_mmu_page structs being used as roots.
- * All kvm_mmu_page structs in the list should have
+ * List of struct kvm_mmu_pages being used as roots.
+ * All struct kvm_mmu_pages in the list should have
* tdp_mmu_page set.
*
* For reads, this list is protected by:
@@ -1520,6 +1552,8 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
struct kvm_x86_ops {
const char *name;
+ int (*check_processor_compatibility)(void);
+
int (*hardware_enable)(void);
void (*hardware_disable)(void);
void (*hardware_unsetup)(void);
@@ -1608,6 +1642,8 @@ struct kvm_x86_ops {
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason);
+ const unsigned long required_apicv_inhibits;
+ bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(int isr);
@@ -1731,9 +1767,6 @@ struct kvm_x86_nested_ops {
};
struct kvm_x86_init_ops {
- int (*cpu_has_kvm_support)(void);
- int (*disabled_by_bios)(void);
- int (*check_processor_compatibility)(void);
int (*hardware_setup)(void);
unsigned int (*handle_intel_pt_intr)(void);
@@ -1760,6 +1793,9 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops);
+void kvm_x86_vendor_exit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1982,7 +2018,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
bool kvm_apicv_activated(struct kvm *kvm);
bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
@@ -2054,14 +2090,11 @@ enum {
TASK_SWITCH_GATE = 3,
};
-#define HF_GIF_MASK (1 << 0)
-#define HF_NMI_MASK (1 << 3)
-#define HF_IRET_MASK (1 << 4)
-#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
+#define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */
#ifdef CONFIG_KVM_SMM
-#define HF_SMM_MASK (1 << 6)
-#define HF_SMM_INSIDE_NMI_MASK (1 << 7)
+#define HF_SMM_MASK (1 << 1)
+#define HF_SMM_INSIDE_NMI_MASK (1 << 2)
# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
# define KVM_ADDRESS_SPACE_NUM 2
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 04c17be9b5fd..bc5b4d788c08 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,6 +25,8 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
+void cpu_emergency_disable_virtualization(void);
+
typedef void (*nmi_shootdown_cb)(int, struct pt_regs*);
void nmi_panic_self_stop(struct pt_regs *regs);
void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 559176887791..4a03993e0785 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -21,12 +21,18 @@
* This is a software only structure and not part of the TDX module/VMM ABI.
*/
struct tdx_hypercall_args {
+ u64 r8;
+ u64 r9;
u64 r10;
u64 r11;
u64 r12;
u64 r13;
u64 r14;
u64 r15;
+ u64 rdi;
+ u64 rsi;
+ u64 rbx;
+ u64 rdx;
};
/* Used to request services from the VMM */
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 8757078d4442..3b12e6b99412 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -126,7 +126,21 @@ static inline void cpu_svm_disable(void)
wrmsrl(MSR_VM_HSAVE_PA, 0);
rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ if (efer & EFER_SVME) {
+ /*
+ * Force GIF=1 prior to disabling SVM to ensure INIT and NMI
+ * aren't blocked, e.g. if a fatal error occurred between CLGI
+ * and STGI. Note, STGI may #UD if SVM is disabled from NMI
+ * context between reading EFER and executing STGI. In that
+ * case, GIF must already be set, otherwise the NMI would have
+ * been blocked, so just eat the fault.
+ */
+ asm_volatile_goto("1: stgi\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ ::: "memory" : fault);
+fault:
+ wrmsrl(MSR_EFER, efer & ~EFER_SVME);
+ }
}
/** Makes sure SVM is disabled, if it is supported on the CPU
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 16f548a661cf..5fc35f889cd1 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -38,9 +38,11 @@ extern struct start_info *xen_start_info;
#include <asm/processor.h>
+#define XEN_SIGNATURE "XenVMMXenVMM"
+
static inline uint32_t xen_cpuid_base(void)
{
- return hypervisor_cpuid_base("XenVMMXenVMM", 2);
+ return hypervisor_cpuid_base(XEN_SIGNATURE, 2);
}
struct pci_dev;
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index e48deab8901d..7f467fe05d42 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/ioctl.h>
+#include <linux/stddef.h>
#define KVM_PIO_PAGE_OFFSET 1
#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
@@ -507,8 +508,8 @@ struct kvm_nested_state {
* KVM_{GET,PUT}_NESTED_STATE ioctl values.
*/
union {
- struct kvm_vmx_nested_state_data vmx[0];
- struct kvm_svm_nested_state_data svm[0];
+ __DECLARE_FLEX_ARRAY(struct kvm_vmx_nested_state_data, vmx);
+ __DECLARE_FLEX_ARRAY(struct kvm_svm_nested_state_data, svm);
} data;
};
@@ -525,6 +526,35 @@ struct kvm_pmu_event_filter {
#define KVM_PMU_EVENT_ALLOW 0
#define KVM_PMU_EVENT_DENY 1
+#define KVM_PMU_EVENT_FLAG_MASKED_EVENTS BIT(0)
+#define KVM_PMU_EVENT_FLAGS_VALID_MASK (KVM_PMU_EVENT_FLAG_MASKED_EVENTS)
+
+/*
+ * Masked event layout.
+ * Bits Description
+ * ---- -----------
+ * 7:0 event select (low bits)
+ * 15:8 umask match
+ * 31:16 unused
+ * 35:32 event select (high bits)
+ * 36:54 unused
+ * 55 exclude bit
+ * 63:56 umask mask
+ */
+
+#define KVM_PMU_ENCODE_MASKED_ENTRY(event_select, mask, match, exclude) \
+ (((event_select) & 0xFFULL) | (((event_select) & 0XF00ULL) << 24) | \
+ (((mask) & 0xFFULL) << 56) | \
+ (((match) & 0xFFULL) << 8) | \
+ ((__u64)(!!(exclude)) << 55))
+
+#define KVM_PMU_MASKED_ENTRY_EVENT_SELECT \
+ (GENMASK_ULL(7, 0) | GENMASK_ULL(35, 32))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK (GENMASK_ULL(63, 56))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MATCH (GENMASK_ULL(15, 8))
+#define KVM_PMU_MASKED_ENTRY_EXCLUDE (BIT_ULL(55))
+#define KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT (56)
+
/* for KVM_{GET,SET,HAS}_DEVICE_ATTR */
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index ef9e951415c5..283dcd2f62c8 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -76,12 +76,18 @@ static void __used common(void)
OFFSET(TDX_MODULE_r11, tdx_module_output, r11);
BLANK();
+ OFFSET(TDX_HYPERCALL_r8, tdx_hypercall_args, r8);
+ OFFSET(TDX_HYPERCALL_r9, tdx_hypercall_args, r9);
OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10);
OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11);
OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12);
OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13);
OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14);
OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15);
+ OFFSET(TDX_HYPERCALL_rdi, tdx_hypercall_args, rdi);
+ OFFSET(TDX_HYPERCALL_rsi, tdx_hypercall_args, rsi);
+ OFFSET(TDX_HYPERCALL_rbx, tdx_hypercall_args, rbx);
+ OFFSET(TDX_HYPERCALL_rdx, tdx_hypercall_args, rdx);
BLANK();
OFFSET(BP_scratch, boot_params, scratch);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index f924a76c6923..f36dc2f796c5 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -460,7 +460,7 @@ static void __init ms_hyperv_init_platform(void)
* setting of this MSR bit should happen before init_intel()
* is called.
*/
- wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1);
+ wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 305514431f26..cdd92ab43cda 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -37,7 +37,6 @@
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
-#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
#include <asm/cmdline.h>
@@ -81,15 +80,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Disable VMX or SVM if needed.
- *
- * We need to disable virtualization on all CPUs.
- * Having VMX or SVM enabled on any CPU may break rebooting
- * after the kdump kernel has finished its task.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
-
/*
* Disable Intel PT to stop its logging
*/
@@ -148,12 +138,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
*/
cpu_crash_vmclear_loaded_vmcss();
- /* Booting kdump kernel with VMX or SVM enabled won't work,
- * because (among other limitations) we can't disable paging
- * with the virt flags.
- */
- cpu_emergency_vmxoff();
- cpu_emergency_svm_disable();
+ cpu_emergency_disable_virtualization();
/*
* Disable Intel PT to stop its logging
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c315b18ec7c8..776f4b1e395b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -560,14 +560,14 @@ nmi_restart:
}
}
-#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
-DEFINE_IDTENTRY_RAW(exc_nmi_noist)
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+DEFINE_IDTENTRY_RAW(exc_nmi_kvm_vmx)
{
exc_nmi(regs);
}
-#endif
#if IS_MODULE(CONFIG_KVM_INTEL)
-EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
+EXPORT_SYMBOL_GPL(asm_exc_nmi_kvm_vmx);
+#endif
#endif
#ifdef CONFIG_NMI_CHECK_CPU
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c3636ea4aa71..d03c551defcc 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -528,33 +528,29 @@ static inline void kb_wait(void)
}
}
-static void vmxoff_nmi(int cpu, struct pt_regs *regs)
-{
- cpu_emergency_vmxoff();
-}
+static inline void nmi_shootdown_cpus_on_restart(void);
-/* Use NMIs as IPIs to tell all CPUs to disable virtualization */
-static void emergency_vmx_disable_all(void)
+static void emergency_reboot_disable_virtualization(void)
{
/* Just make sure we won't change CPUs while doing this */
local_irq_disable();
/*
- * Disable VMX on all CPUs before rebooting, otherwise we risk hanging
- * the machine, because the CPU blocks INIT when it's in VMX root.
+ * Disable virtualization on all CPUs before rebooting to avoid hanging
+ * the system, as VMX and SVM block INIT when running in the host.
*
* We can't take any locks and we may be on an inconsistent state, so
- * use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
+ * use NMIs as IPIs to tell the other CPUs to disable VMX/SVM and halt.
*
- * Do the NMI shootdown even if VMX if off on _this_ CPU, as that
- * doesn't prevent a different CPU from being in VMX root operation.
+ * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
+ * other CPUs may have virtualization enabled.
*/
- if (cpu_has_vmx()) {
- /* Safely force _this_ CPU out of VMX root operation. */
- __cpu_emergency_vmxoff();
+ if (cpu_has_vmx() || cpu_has_svm(NULL)) {
+ /* Safely force _this_ CPU out of VMX/SVM operation. */
+ cpu_emergency_disable_virtualization();
- /* Halt and exit VMX root operation on the other CPUs. */
- nmi_shootdown_cpus(vmxoff_nmi);
+ /* Disable VMX/SVM and halt on other CPUs. */
+ nmi_shootdown_cpus_on_restart();
}
}
@@ -590,7 +586,7 @@ static void native_machine_emergency_restart(void)
unsigned short mode;
if (reboot_emergency)
- emergency_vmx_disable_all();
+ emergency_reboot_disable_virtualization();
tboot_shutdown(TB_SHUTDOWN_REBOOT);
@@ -795,6 +791,17 @@ void machine_crash_shutdown(struct pt_regs *regs)
/* This is the CPU performing the emergency shutdown work. */
int crashing_cpu = -1;
+/*
+ * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
+ * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
+ * GIF=0, i.e. if the crash occurred between CLGI and STGI.
+ */
+void cpu_emergency_disable_virtualization(void)
+{
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+}
+
#if defined(CONFIG_SMP)
static nmi_shootdown_cb shootdown_callback;
@@ -817,7 +824,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
local_irq_disable();
- shootdown_callback(cpu, regs);
+ if (shootdown_callback)
+ shootdown_callback(cpu, regs);
+
+ /*
+ * Prepare the CPU for reboot _after_ invoking the callback so that the
+ * callback can safely use virtualization instructions, e.g. VMCLEAR.
+ */
+ cpu_emergency_disable_virtualization();
atomic_dec(&waiting_for_crash_ipi);
/* Assume hlt works */
@@ -828,18 +842,32 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
return NMI_HANDLED;
}
-/*
- * Halt all other CPUs, calling the specified function on each of them
+/**
+ * nmi_shootdown_cpus - Stop other CPUs via NMI
+ * @callback: Optional callback to be invoked from the NMI handler
+ *
+ * The NMI handler on the remote CPUs invokes @callback, if not
+ * NULL, first and then disables virtualization to ensure that
+ * INIT is recognized during reboot.
*
- * This function can be used to halt all other CPUs on crash
- * or emergency reboot time. The function passed as parameter
- * will be called inside a NMI handler on all CPUs.
+ * nmi_shootdown_cpus() can only be invoked once. After the first
+ * invocation all other CPUs are stuck in crash_nmi_callback() and
+ * cannot respond to a second NMI.
*/
void nmi_shootdown_cpus(nmi_shootdown_cb callback)
{
unsigned long msecs;
+
local_irq_disable();
+ /*
+ * Avoid certain doom if a shootdown already occurred; re-registering
+ * the NMI handler will cause list corruption, modifying the callback
+ * will do who knows what, etc...
+ */
+ if (WARN_ON_ONCE(crash_ipi_issued))
+ return;
+
/* Make a note of crashing cpu. Will be used in NMI callback. */
crashing_cpu = safe_smp_processor_id();
@@ -867,7 +895,17 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
msecs--;
}
- /* Leave the nmi callback set */
+ /*
+ * Leave the nmi callback set, shootdown is a one-time thing. Clearing
+ * the callback could result in a NULL pointer dereference if a CPU
+ * (finally) responds after the timeout expires.
+ */
+}
+
+static inline void nmi_shootdown_cpus_on_restart(void)
+{
+ if (!crash_ipi_issued)
+ nmi_shootdown_cpus(NULL);
}
/*
@@ -897,6 +935,8 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
/* No other CPUs to shoot down */
}
+static inline void nmi_shootdown_cpus_on_restart(void) { }
+
void run_crash_ipi_callback(struct pt_regs *regs)
{
}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 06db901fabe8..375b33ecafa2 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -32,7 +32,7 @@
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
#include <asm/kexec.h>
-#include <asm/virtext.h>
+#include <asm/reboot.h>
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -122,7 +122,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
return NMI_HANDLED;
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
return NMI_HANDLED;
@@ -134,7 +134,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
ack_APIC_irq();
- cpu_emergency_vmxoff();
+ cpu_emergency_disable_virtualization();
stop_this_cpu(NULL);
}
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index fbeaa9ddef59..8e578311ca9d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -49,6 +49,7 @@ config KVM
select SRCU
select INTERVAL_TREE
select HAVE_KVM_PM_NOTIFIER if PM
+ select KVM_GENERIC_HARDWARE_ENABLING
help
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 7f1b585f9a67..599aebec2d52 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -8,6 +8,7 @@
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
* Copyright IBM Corporation, 2008
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/export.h>
@@ -25,6 +26,7 @@
#include "mmu.h"
#include "trace.h"
#include "pmu.h"
+#include "xen.h"
/*
* Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be
@@ -180,15 +182,15 @@ static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2
return 0;
}
-static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
+static struct kvm_hypervisor_cpuid kvm_get_hypervisor_cpuid(struct kvm_vcpu *vcpu,
+ const char *sig)
{
- u32 function;
+ struct kvm_hypervisor_cpuid cpuid = {};
struct kvm_cpuid_entry2 *entry;
+ u32 base;
- vcpu->arch.kvm_cpuid_base = 0;
-
- for_each_possible_hypervisor_cpuid_base(function) {
- entry = kvm_find_cpuid_entry(vcpu, function);
+ for_each_possible_hypervisor_cpuid_base(base) {
+ entry = kvm_find_cpuid_entry(vcpu, base);
if (entry) {
u32 signature[3];
@@ -197,19 +199,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
signature[1] = entry->ecx;
signature[2] = entry->edx;
- BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE));
- if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) {
- vcpu->arch.kvm_cpuid_base = function;
+ if (!memcmp(signature, sig, sizeof(signature))) {
+ cpuid.base = base;
+ cpuid.limit = entry->eax;
break;
}
}
}
+
+ return cpuid;
}
static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
struct kvm_cpuid_entry2 *entries, int nent)
{
- u32 base = vcpu->arch.kvm_cpuid_base;
+ u32 base = vcpu->arch.kvm_cpuid.base;
if (!base)
return NULL;
@@ -439,7 +443,8 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_entries = e2;
vcpu->arch.cpuid_nent = nent;
- kvm_update_kvm_cpuid_base(vcpu);
+ vcpu->arch.kvm_cpuid = kvm_get_hypervisor_cpuid(vcpu, KVM_SIGNATURE);
+ vcpu->arch.xen.cpuid = kvm_get_hypervisor_cpuid(vcpu, XEN_SIGNATURE);
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -663,8 +668,9 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD);
kvm_cpu_cap_mask(CPUID_7_1_EAX,
- F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) | F(AMX_FP16) |
- F(AVX_IFMA)
+ F(AVX_VNNI) | F(AVX512_BF16) | F(CMPCCXADD) |
+ F(FZRM) | F(FSRS) | F(FSRC) |
+ F(AMX_FP16) | F(AVX_IFMA)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
@@ -701,6 +707,10 @@ void kvm_set_cpu_caps(void)
if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64))
kvm_cpu_cap_set(X86_FEATURE_GBPAGES);
+ kvm_cpu_cap_init_kvm_defined(CPUID_8000_0007_EDX,
+ SF(CONSTANT_TSC)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0008_EBX,
F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
@@ -1169,8 +1179,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx &= ~GENMASK(17, 16);
break;
case 0x80000007: /* Advanced power management */
- /* invariant TSC is CPUID.80000007H:EDX[8] */
- entry->edx &= (1 << 8);
+ cpuid_entry_override(entry, CPUID_8000_0007_EDX);
+
/* mask against host */
entry->edx &= boot_cpu_data.x86_power;
entry->eax = entry->ebx = entry->ecx = 0;
@@ -1485,6 +1495,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) &&
(data & TSX_CTRL_CPUID_CLEAR))
*ebx &= ~(F(RTM) | F(HLE));
+ } else if (function == 0x80000007) {
+ if (kvm_hv_invtsc_suppressed(vcpu))
+ *edx &= ~SF(CONSTANT_TSC);
}
} else {
*eax = *ebx = *ecx = *edx = 0;
diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c
index c1390357126a..ee8c4c3496ed 100644
--- a/arch/x86/kvm/debugfs.c
+++ b/arch/x86/kvm/debugfs.c
@@ -4,6 +4,8 @@
*
* Copyright 2016 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <linux/debugfs.h>
#include "lapic.h"
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 56e1cf7c339e..a20bec931764 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -17,6 +17,7 @@
*
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
@@ -1633,7 +1634,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
case VCPU_SREG_SS:
/*
* segment is not a writable data segment or segment
- * selector's RPL != CPL or segment selector's RPL != CPL
+ * selector's RPL != CPL or DPL != CPL
*/
if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
goto exception;
@@ -1695,11 +1696,11 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
/*
* segment is not a data or readable code segment or
* ((segment is a data or nonconforming code segment)
- * and (both RPL and CPL > DPL))
+ * and ((RPL > DPL) or (CPL > DPL)))
*/
if ((seg_desc.type & 0xa) == 0x8 ||
(((seg_desc.type & 0xc) != 0xc) &&
- (rpl > dpl && cpl > dpl)))
+ (rpl > dpl || cpl > dpl)))
goto exception;
break;
}
@@ -2309,7 +2310,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
- if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0)
+ if (!ctxt->ops->is_smm(ctxt))
return emulate_ud(ctxt);
if (ctxt->ops->leave_smm(ctxt))
@@ -5132,7 +5133,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
const struct x86_emulate_ops *ops = ctxt->ops;
int rc = X86EMUL_CONTINUE;
int saved_dst_type = ctxt->dst.type;
- unsigned emul_flags;
+ bool is_guest_mode = ctxt->ops->is_guest_mode(ctxt);
ctxt->mem_read.pos = 0;
@@ -5147,7 +5148,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- emul_flags = ctxt->ops->get_hflags(ctxt);
if (unlikely(ctxt->d &
(No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) {
if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
@@ -5181,7 +5181,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(&ctxt->dst);
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
+ if (unlikely(is_guest_mode) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5210,7 +5210,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -5264,7 +5264,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
+ if (unlikely(is_guest_mode) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e8296942a868..b28fd020066f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -17,6 +17,7 @@
* Ben-Ami Yassour <benami@il.ibm.com>
* Andrey Smetanin <asmetanin@virtuozzo.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "lapic.h"
@@ -43,6 +44,24 @@
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, HV_VCPUS_PER_SPARSE_BANK)
+/*
+ * As per Hyper-V TLFS, extended hypercalls start from 0x8001
+ * (HvExtCallQueryCapabilities). Response of this hypercalls is a 64 bit value
+ * where each bit tells which extended hypercall is available besides
+ * HvExtCallQueryCapabilities.
+ *
+ * 0x8001 - First extended hypercall, HvExtCallQueryCapabilities, no bit
+ * assigned.
+ *
+ * 0x8002 - Bit 0
+ * 0x8003 - Bit 1
+ * ..
+ * 0x8041 - Bit 63
+ *
+ * Therefore, HV_EXT_CALL_MAX = 0x8001 + 64
+ */
+#define HV_EXT_CALL_MAX (HV_EXT_CALL_QUERY_CAPABILITIES + 64)
+
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
bool vcpu_kick);
@@ -999,6 +1018,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
r = true;
@@ -1283,6 +1303,9 @@ static bool hv_check_msr_access(struct kvm_vcpu_hv *hv_vcpu, u32 msr)
case HV_X64_MSR_TSC_EMULATION_STATUS:
return hv_vcpu->cpuid_cache.features_eax &
HV_ACCESS_REENLIGHTENMENT;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ return hv_vcpu->cpuid_cache.features_eax &
+ HV_ACCESS_TSC_INVARIANT;
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
return hv_vcpu->cpuid_cache.features_edx &
@@ -1410,12 +1433,22 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
if (!host)
return 1;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ /* Only bit 0 is supported */
+ if (data & ~HV_EXPOSE_INVARIANT_TSC)
+ return 1;
+
+ /* The feature can't be disabled from the guest */
+ if (!host && hv->hv_invtsc_control && !data)
+ return 1;
+
+ hv->hv_invtsc_control = data;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_set_msr(vcpu, msr, data, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
return 0;
@@ -1536,8 +1569,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
@@ -1585,11 +1617,14 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
case HV_X64_MSR_TSC_EMULATION_STATUS:
data = hv->hv_tsc_emulation_status;
break;
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
+ data = hv->hv_invtsc_control;
+ break;
case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
return syndbg_get_msr(vcpu, msr, pdata, host);
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
@@ -1654,7 +1689,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
data = APIC_BUS_FREQUENCY;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
+ kvm_pr_unimpl_rdmsr(vcpu, msr);
return 1;
}
*pdata = data;
@@ -2420,6 +2455,9 @@ static bool hv_check_hypercall_access(struct kvm_vcpu_hv *hv_vcpu, u16 code)
case HVCALL_SEND_IPI:
return hv_vcpu->cpuid_cache.enlightenments_eax &
HV_X64_CLUSTER_IPI_RECOMMENDED;
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ return hv_vcpu->cpuid_cache.features_ebx &
+ HV_ENABLE_EXTENDED_HYPERCALLS;
default:
break;
}
@@ -2512,14 +2550,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
case HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST:
if (unlikely(hc.var_cnt)) {
ret = HV_STATUS_INVALID_HYPERCALL_INPUT;
@@ -2578,15 +2609,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = HV_STATUS_OPERATION_DENIED;
break;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERV;
- vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
- vcpu->run->hyperv.u.hcall.input = hc.param;
- vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
- vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
- vcpu->arch.complete_userspace_io =
- kvm_hv_hypercall_complete_userspace;
- return 0;
+ goto hypercall_userspace_exit;
}
+ case HV_EXT_CALL_QUERY_CAPABILITIES ... HV_EXT_CALL_MAX:
+ if (unlikely(hc.fast)) {
+ ret = HV_STATUS_INVALID_PARAMETER;
+ break;
+ }
+ goto hypercall_userspace_exit;
default:
ret = HV_STATUS_INVALID_HYPERCALL_CODE;
break;
@@ -2594,6 +2624,15 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
hypercall_complete:
return kvm_hv_hypercall_complete(vcpu, ret);
+
+hypercall_userspace_exit:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+ vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+ vcpu->run->hyperv.u.hcall.input = hc.param;
+ vcpu->run->hyperv.u.hcall.params[0] = hc.ingpa;
+ vcpu->run->hyperv.u.hcall.params[1] = hc.outgpa;
+ vcpu->arch.complete_userspace_io = kvm_hv_hypercall_complete_userspace;
+ return 0;
}
void kvm_hv_init_vm(struct kvm *kvm)
@@ -2733,9 +2772,11 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
ent->eax |= HV_ACCESS_FREQUENCY_MSRS;
ent->eax |= HV_ACCESS_REENLIGHTENMENT;
+ ent->eax |= HV_ACCESS_TSC_INVARIANT;
ent->ebx |= HV_POST_MESSAGES;
ent->ebx |= HV_SIGNAL_EVENTS;
+ ent->ebx |= HV_ENABLE_EXTENDED_HYPERCALLS;
ent->edx |= HV_X64_HYPERCALL_XMM_INPUT_AVAILABLE;
ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE;
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index 9f96414a31c5..f83b8db72b11 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -136,6 +136,33 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
HV_SYNIC_STIMER_COUNT);
}
+/*
+ * With HV_ACCESS_TSC_INVARIANT feature, invariant TSC (CPUID.80000007H:EDX[8])
+ * is only observed after HV_X64_MSR_TSC_INVARIANT_CONTROL was written to.
+ */
+static inline bool kvm_hv_invtsc_suppressed(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * If Hyper-V's invariant TSC control is not exposed to the guest,
+ * the invariant TSC CPUID flag is not suppressed, Windows guests were
+ * observed to be able to handle it correctly. Going forward, VMMs are
+ * encouraged to enable Hyper-V's invariant TSC control when invariant
+ * TSC CPUID flag is set to make KVM's behavior match genuine Hyper-V.
+ */
+ if (!hv_vcpu ||
+ !(hv_vcpu->cpuid_cache.features_eax & HV_ACCESS_TSC_INVARIANT))
+ return false;
+
+ /*
+ * If Hyper-V's invariant TSC control is exposed to the guest, KVM is
+ * responsible for suppressing the invariant TSC CPUID flag if the
+ * Hyper-V control is not enabled.
+ */
+ return !(to_kvm_hv(vcpu->kvm)->hv_invtsc_control & HV_EXPOSE_INVARIANT_TSC);
+}
+
void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
void kvm_hv_setup_tsc_page(struct kvm *kvm,
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index e0a7a0e7a73c..cd57a517d04a 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -30,7 +30,7 @@
* Based on QEMU and Xen.
*/
-#define pr_fmt(fmt) "pit: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -351,7 +351,7 @@ static void create_pit_timer(struct kvm_pit *pit, u32 val, int is_period)
if (ps->period < min_period) {
pr_info_ratelimited(
- "kvm: requested %lld ns "
+ "requested %lld ns "
"i8254 timer period limited to %lld ns\n",
ps->period, min_period);
ps->period = min_period;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index e1bb6218bb96..4756bcb5724f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,8 @@
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
* Port from Qemu.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bitops.h>
@@ -35,7 +37,7 @@
#include "trace.h"
#define pr_pic_unimpl(fmt, ...) \
- pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+ pr_err_ratelimited("pic: " fmt, ## __VA_ARGS__)
static void pic_irq_request(struct kvm *kvm, int level);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 765943d7cfa5..042dee556125 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -26,6 +26,7 @@
* Yaozu (Eddie) Dong <eddie.dong@intel.com>
* Based on Xen 3.1 code.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index a70952eca905..b2c397dd2bc6 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -7,6 +7,7 @@
* Authors:
* Yaozu (Eddie) Dong <Eddie.dong@intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/export.h>
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 3742d9adacfc..16d076a1b91a 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -8,6 +8,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/slab.h>
@@ -56,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
if (irq->dest_mode == APIC_DEST_PHYSICAL &&
irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
- printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ pr_info("apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -199,7 +200,7 @@ int kvm_request_irq_source_id(struct kvm *kvm)
irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
if (irq_source_id >= BITS_PER_LONG) {
- printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ pr_warn("exhausted allocatable IRQ sources!\n");
irq_source_id = -EFAULT;
goto unlock;
}
@@ -221,7 +222,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
mutex_lock(&kvm->irq_lock);
if (irq_source_id < 0 ||
irq_source_id >= BITS_PER_LONG) {
- printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ pr_err("IRQ source ID out of range!\n");
goto unlock;
}
clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index c09174f73a34..4c91f626c058 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -76,6 +76,18 @@ static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu,
}
/*
+ * kvm_register_test_and_mark_available() is a special snowflake that uses an
+ * arch bitop directly to avoid the explicit instrumentation that comes with
+ * the generic bitops. This allows code that cannot be instrumented (noinstr
+ * functions), e.g. the low level VM-Enter/VM-Exit paths, to cache registers.
+ */
+static __always_inline bool kvm_register_test_and_mark_available(struct kvm_vcpu *vcpu,
+ enum kvm_reg reg)
+{
+ return arch___test_and_set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+}
+
+/*
* The "raw" register helpers are only for cases where the full 64 bits of a
* register are read/written irrespective of current vCPU mode. In other words,
* odds are good you shouldn't be using the raw variants.
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 2d9662be8333..ab65f3a47dfd 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -220,7 +220,8 @@ struct x86_emulate_ops {
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
- unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_smm)(struct x86_emulate_ctxt *ctxt);
+ bool (*is_guest_mode)(struct x86_emulate_ctxt *ctxt);
int (*leave_smm)(struct x86_emulate_ctxt *ctxt);
void (*triple_fault)(struct x86_emulate_ctxt *ctxt);
int (*set_xcr)(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr);
@@ -275,10 +276,6 @@ enum x86emul_mode {
X86EMUL_MODE_PROT64, /* 64-bit (long) mode. */
};
-/* These match some of the HF_* flags defined in kvm_host.h */
-#define X86EMUL_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
-#define X86EMUL_SMM_MASK (1 << 6)
-
/*
* fastop functions are declared as taking a never-defined fastop parameter,
* so they can't be called from C directly.
diff --git a/arch/x86/kvm/kvm_onhyperv.c b/arch/x86/kvm/kvm_onhyperv.c
index ee4f696a0782..482d6639ef88 100644
--- a/arch/x86/kvm/kvm_onhyperv.c
+++ b/arch/x86/kvm/kvm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mshyperv.h>
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4efdb4a4d72c..e542cf285b51 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -15,6 +15,7 @@
*
* Based on Xen 3.1 code, Copyright (c) 2004, Intel Corporation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/kvm.h>
@@ -166,9 +167,19 @@ static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
return kvm_can_post_timer_interrupt(vcpu) && vcpu->mode == IN_GUEST_MODE;
}
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
- switch (map->mode) {
+ switch (map->logical_mode) {
+ case KVM_APIC_MODE_SW_DISABLED:
+ /* Arbitrarily use the flat map so that @cluster isn't NULL. */
+ *cluster = map->xapic_flat_map;
+ *mask = 0;
+ return true;
case KVM_APIC_MODE_X2APIC: {
u32 offset = (dest_id >> 16) * 16;
u32 max_apic_id = map->max_apic_id;
@@ -193,8 +204,10 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
*cluster = map->xapic_cluster_map[(dest_id >> 4) & 0xf];
*mask = dest_id & 0xf;
return true;
+ case KVM_APIC_MODE_MAP_DISABLED:
+ return false;
default:
- /* Not optimized. */
+ WARN_ON_ONCE(1);
return false;
}
}
@@ -206,6 +219,134 @@ static void kvm_apic_map_free(struct rcu_head *rcu)
kvfree(map);
}
+static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu,
+ bool *xapic_id_mismatch)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 x2apic_id = kvm_x2apic_id(apic);
+ u32 xapic_id = kvm_xapic_id(apic);
+ u32 physical_id;
+
+ /*
+ * Deliberately truncate the vCPU ID when detecting a mismatched APIC
+ * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
+ * 32-bit value. Any unwanted aliasing due to truncation results will
+ * be detected below.
+ */
+ if (!apic_x2apic_mode(apic) && xapic_id != (u8)vcpu->vcpu_id)
+ *xapic_id_mismatch = true;
+
+ /*
+ * Apply KVM's hotplug hack if userspace has enable 32-bit APIC IDs.
+ * Allow sending events to vCPUs by their x2APIC ID even if the target
+ * vCPU is in legacy xAPIC mode, and silently ignore aliased xAPIC IDs
+ * (the x2APIC ID is truncated to 8 bits, causing IDs > 0xff to wrap
+ * and collide).
+ *
+ * Honor the architectural (and KVM's non-optimized) behavior if
+ * userspace has not enabled 32-bit x2APIC IDs. Each APIC is supposed
+ * to process messages independently. If multiple vCPUs have the same
+ * effective APIC ID, e.g. due to the x2APIC wrap or because the guest
+ * manually modified its xAPIC IDs, events targeting that ID are
+ * supposed to be recognized by all vCPUs with said ID.
+ */
+ if (vcpu->kvm->arch.x2apic_format) {
+ /* See also kvm_apic_match_physical_addr(). */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+ } else {
+ /*
+ * Disable the optimized map if the physical APIC ID is already
+ * mapped, i.e. is aliased to multiple vCPUs. The optimized
+ * map requires a strict 1:1 mapping between IDs and vCPUs.
+ */
+ if (apic_x2apic_mode(apic))
+ physical_id = x2apic_id;
+ else
+ physical_id = xapic_id;
+
+ if (new->phys_map[physical_id])
+ return -EINVAL;
+
+ new->phys_map[physical_id] = apic;
+ }
+
+ return 0;
+}
+
+static void kvm_recalculate_logical_map(struct kvm_apic_map *new,
+ struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ enum kvm_apic_logical_mode logical_mode;
+ struct kvm_lapic **cluster;
+ u16 mask;
+ u32 ldr;
+
+ if (new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ return;
+
+ if (!kvm_apic_sw_enabled(apic))
+ return;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ if (!ldr)
+ return;
+
+ if (apic_x2apic_mode(apic)) {
+ logical_mode = KVM_APIC_MODE_X2APIC;
+ } else {
+ ldr = GET_APIC_LOGICAL_ID(ldr);
+ if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+ logical_mode = KVM_APIC_MODE_XAPIC_FLAT;
+ else
+ logical_mode = KVM_APIC_MODE_XAPIC_CLUSTER;
+ }
+
+ /*
+ * To optimize logical mode delivery, all software-enabled APICs must
+ * be configured for the same mode.
+ */
+ if (new->logical_mode == KVM_APIC_MODE_SW_DISABLED) {
+ new->logical_mode = logical_mode;
+ } else if (new->logical_mode != logical_mode) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ /*
+ * In x2APIC mode, the LDR is read-only and derived directly from the
+ * x2APIC ID, thus is guaranteed to be addressable. KVM reuses
+ * kvm_apic_map.phys_map to optimize logical mode x2APIC interrupts by
+ * reversing the LDR calculation to get cluster of APICs, i.e. no
+ * additional work is required.
+ */
+ if (apic_x2apic_mode(apic)) {
+ WARN_ON_ONCE(ldr != kvm_apic_calc_x2apic_ldr(kvm_x2apic_id(apic)));
+ return;
+ }
+
+ if (WARN_ON_ONCE(!kvm_apic_map_get_logical_dest(new, ldr,
+ &cluster, &mask))) {
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ return;
+ }
+
+ if (!mask)
+ return;
+
+ ldr = ffs(mask) - 1;
+ if (!is_power_of_2(mask) || cluster[ldr])
+ new->logical_mode = KVM_APIC_MODE_MAP_DISABLED;
+ else
+ cluster[ldr] = apic;
+}
+
/*
* CLEAN -> DIRTY and UPDATE_IN_PROGRESS -> DIRTY changes happen without a lock.
*
@@ -224,6 +365,7 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
struct kvm_vcpu *vcpu;
unsigned long i;
u32 max_id = 255; /* enough space for any xAPIC ID */
+ bool xapic_id_mismatch = false;
/* Read kvm->arch.apic_map_dirty before kvm->arch.apic_map. */
if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN)
@@ -256,54 +398,41 @@ void kvm_recalculate_apic_map(struct kvm *kvm)
goto out;
new->max_apic_id = max_id;
+ new->logical_mode = KVM_APIC_MODE_SW_DISABLED;
kvm_for_each_vcpu(i, vcpu, kvm) {
- struct kvm_lapic *apic = vcpu->arch.apic;
- struct kvm_lapic **cluster;
- u16 mask;
- u32 ldr;
- u8 xapic_id;
- u32 x2apic_id;
-
if (!kvm_apic_present(vcpu))
continue;
- xapic_id = kvm_xapic_id(apic);
- x2apic_id = kvm_x2apic_id(apic);
-
- /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
- if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
- x2apic_id <= new->max_apic_id)
- new->phys_map[x2apic_id] = apic;
- /*
- * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
- * prevent them from masking VCPUs with APIC ID <= 0xff.
- */
- if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
- new->phys_map[xapic_id] = apic;
-
- if (!kvm_apic_sw_enabled(apic))
- continue;
-
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
-
- if (apic_x2apic_mode(apic)) {
- new->mode |= KVM_APIC_MODE_X2APIC;
- } else if (ldr) {
- ldr = GET_APIC_LOGICAL_ID(ldr);
- if (kvm_lapic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
- new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
- else
- new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+ if (kvm_recalculate_phys_map(new, vcpu, &xapic_id_mismatch)) {
+ kvfree(new);
+ new = NULL;
+ goto out;
}
- if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
- continue;
-
- if (mask)
- cluster[ffs(mask) - 1] = apic;
+ kvm_recalculate_logical_map(new, vcpu);
}
out:
+ /*
+ * The optimized map is effectively KVM's internal version of APICv,
+ * and all unwanted aliasing that results in disabling the optimized
+ * map also applies to APICv.
+ */
+ if (!new)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED);
+
+ if (!new || new->logical_mode == KVM_APIC_MODE_MAP_DISABLED)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED);
+
+ if (xapic_id_mismatch)
+ kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+ else
+ kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
+
old = rcu_dereference_protected(kvm->arch.apic_map,
lockdep_is_held(&kvm->arch.apic_map_lock));
rcu_assign_pointer(kvm->arch.apic_map, new);
@@ -360,11 +489,6 @@ static inline void kvm_apic_set_dfr(struct kvm_lapic *apic, u32 val)
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
-static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
-{
- return ((id >> 4) << 16) | (1 << (id & 0xf));
-}
-
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = kvm_apic_calc_x2apic_ldr(id);
@@ -941,8 +1065,7 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
{
if (!kvm->arch.disabled_lapic_found) {
kvm->arch.disabled_lapic_found = true;
- printk(KERN_INFO
- "Disabled LAPIC found during irq injection\n");
+ pr_info("Disabled LAPIC found during irq injection\n");
}
}
@@ -951,7 +1074,7 @@ static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
{
if (kvm->arch.x2apic_broadcast_quirk_disabled) {
if ((irq->dest_id == APIC_BROADCAST &&
- map->mode != KVM_APIC_MODE_X2APIC))
+ map->logical_mode != KVM_APIC_MODE_X2APIC))
return true;
if (irq->dest_id == X2APIC_BROADCAST)
return true;
@@ -1364,7 +1487,6 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
ktime_t remaining, now;
s64 ns;
- u32 tmcct;
ASSERT(apic != NULL);
@@ -1379,10 +1501,7 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
remaining = 0;
ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
- tmcct = div64_u64(ns,
- (APIC_BUS_CYCLE_NS * apic->divide_count));
-
- return tmcct;
+ return div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->divide_count));
}
static void __report_tpr_access(struct kvm_lapic *apic, bool write)
@@ -1442,19 +1561,15 @@ static inline struct kvm_lapic *to_lapic(struct kvm_io_device *dev)
#define APIC_REGS_MASK(first, count) \
(APIC_REG_MASK(first) * ((1ull << (count)) - 1))
-static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
- void *data)
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic)
{
- unsigned char alignment = offset & 0xf;
- u32 result;
- /* this bitmask has a bit cleared for each reserved register */
+ /* Leave bits '0' for reserved and write-only registers. */
u64 valid_reg_mask =
APIC_REG_MASK(APIC_ID) |
APIC_REG_MASK(APIC_LVR) |
APIC_REG_MASK(APIC_TASKPRI) |
APIC_REG_MASK(APIC_PROCPRI) |
APIC_REG_MASK(APIC_LDR) |
- APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_SPIV) |
APIC_REGS_MASK(APIC_ISR, APIC_ISR_NR) |
APIC_REGS_MASK(APIC_TMR, APIC_ISR_NR) |
@@ -1474,21 +1589,33 @@ static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
if (kvm_lapic_lvt_supported(apic, LVT_CMCI))
valid_reg_mask |= APIC_REG_MASK(APIC_LVTCMCI);
- /*
- * ARBPRI and ICR2 are not valid in x2APIC mode. WARN if KVM reads ICR
- * in x2APIC mode as it's an 8-byte register in x2APIC and needs to be
- * manually handled by the caller.
- */
+ /* ARBPRI, DFR, and ICR2 are not valid in x2APIC mode. */
if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI) |
+ APIC_REG_MASK(APIC_DFR) |
APIC_REG_MASK(APIC_ICR2);
- else
- WARN_ON_ONCE(offset == APIC_ICR);
+
+ return valid_reg_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_readable_reg_mask);
+
+static int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
+ void *data)
+{
+ unsigned char alignment = offset & 0xf;
+ u32 result;
+
+ /*
+ * WARN if KVM reads ICR in x2APIC mode, as it's an 8-byte register in
+ * x2APIC and needs to be manually handled by the caller.
+ */
+ WARN_ON_ONCE(apic_x2apic_mode(apic) && offset == APIC_ICR);
if (alignment + len > 4)
return 1;
- if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset)))
+ if (offset > 0x3f0 ||
+ !(kvm_lapic_readable_reg_mask(apic) & APIC_REG_MASK(offset)))
return 1;
result = __apic_read(apic, offset & ~0xf);
@@ -1560,7 +1687,7 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
if (apic->lapic_timer.period < min_period) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
+ "vcpu %i: requested %lld ns "
"lapic timer period limited to %lld ns\n",
apic->vcpu->vcpu_id,
apic->lapic_timer.period, min_period);
@@ -1841,11 +1968,15 @@ static bool set_target_expiration(struct kvm_lapic *apic, u32 count_reg)
if (unlikely(count_reg != APIC_TMICT)) {
deadline = tmict_to_ns(apic,
kvm_lapic_get_reg(apic, count_reg));
- if (unlikely(deadline <= 0))
- deadline = apic->lapic_timer.period;
+ if (unlikely(deadline <= 0)) {
+ if (apic_lvtt_period(apic))
+ deadline = apic->lapic_timer.period;
+ else
+ deadline = 0;
+ }
else if (unlikely(deadline > apic->lapic_timer.period)) {
pr_info_ratelimited(
- "kvm: vcpu %i: requested lapic timer restore with "
+ "vcpu %i: requested lapic timer restore with "
"starting count register %#x=%u (%lld ns) > initial count (%lld ns). "
"Using initial count to start timer.\n",
apic->vcpu->vcpu_id,
@@ -2068,19 +2199,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
}
}
-static void kvm_lapic_xapic_id_updated(struct kvm_lapic *apic)
-{
- struct kvm *kvm = apic->vcpu->kvm;
-
- if (KVM_BUG_ON(apic_x2apic_mode(apic), kvm))
- return;
-
- if (kvm_xapic_id(apic) == apic->vcpu->vcpu_id)
- return;
-
- kvm_set_apicv_inhibit(apic->vcpu->kvm, APICV_INHIBIT_REASON_APIC_ID_MODIFIED);
-}
-
static int get_lvt_index(u32 reg)
{
if (reg == APIC_LVTCMCI)
@@ -2101,7 +2219,6 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic)) {
kvm_apic_set_xapic_id(apic, val >> 24);
- kvm_lapic_xapic_id_updated(apic);
} else {
ret = 1;
}
@@ -2219,10 +2336,14 @@ static int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic))
- kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
- else
+ /*
+ * Self-IPI exists only when x2APIC is enabled. Bits 7:0 hold
+ * the vector, everything else is reserved.
+ */
+ if (!apic_x2apic_mode(apic) || (val & ~APIC_VECTOR_MASK))
ret = 1;
+ else
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | val, 0);
break;
default:
ret = 1;
@@ -2284,23 +2405,18 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
struct kvm_lapic *apic = vcpu->arch.apic;
u64 val;
- if (apic_x2apic_mode(apic)) {
- if (KVM_BUG_ON(kvm_lapic_msr_read(apic, offset, &val), vcpu->kvm))
- return;
- } else {
- val = kvm_lapic_get_reg(apic, offset);
- }
-
/*
* ICR is a single 64-bit register when x2APIC is enabled. For legacy
* xAPIC, ICR writes need to go down the common (slightly slower) path
* to get the upper half from ICR2.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR) {
+ val = kvm_lapic_get_reg64(apic, APIC_ICR);
kvm_apic_send_ipi(apic, (u32)val, (u32)(val >> 32));
trace_kvm_apic_write(APIC_ICR, val);
} else {
/* TODO: optimize to just emulate side effect w/o one more write */
+ val = kvm_lapic_get_reg(apic, offset);
kvm_lapic_reg_write(apic, offset, (u32)val);
}
}
@@ -2394,11 +2510,15 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
}
- if (((old_value ^ value) & X2APIC_ENABLE) && (value & X2APIC_ENABLE))
- kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ if ((old_value ^ value) & X2APIC_ENABLE) {
+ if (value & X2APIC_ENABLE)
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
+ else if (value & MSR_IA32_APICBASE_ENABLE)
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) {
- kvm_vcpu_update_apicv(vcpu);
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
static_call_cond(kvm_x86_set_virtual_apic_mode)(vcpu);
}
@@ -2429,6 +2549,78 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
*/
apic->isr_count = count_vectors(apic->regs + APIC_ISR);
}
+ apic->highest_isr_cache = -1;
+}
+
+int kvm_alloc_apic_access_page(struct kvm *kvm)
+{
+ struct page *page;
+ void __user *hva;
+ int ret = 0;
+
+ mutex_lock(&kvm->slots_lock);
+ if (kvm->arch.apic_access_memslot_enabled ||
+ kvm->arch.apic_access_memslot_inhibited)
+ goto out;
+
+ hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
+ if (IS_ERR(hva)) {
+ ret = PTR_ERR(hva);
+ goto out;
+ }
+
+ page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Do not pin the page in memory, so that memory hot-unplug
+ * is able to migrate it.
+ */
+ put_page(page);
+ kvm->arch.apic_access_memslot_enabled = true;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_alloc_apic_access_page);
+
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ if (!kvm->arch.apic_access_memslot_enabled)
+ return;
+
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ mutex_lock(&kvm->slots_lock);
+
+ if (kvm->arch.apic_access_memslot_enabled) {
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
+ /*
+ * Clear "enabled" after the memslot is deleted so that a
+ * different vCPU doesn't get a false negative when checking
+ * the flag out of slots_lock. No additional memory barrier is
+ * needed as modifying memslots requires waiting other vCPUs to
+ * drop SRCU (see above), and false positives are ok as the
+ * flag is rechecked after acquiring slots_lock.
+ */
+ kvm->arch.apic_access_memslot_enabled = false;
+
+ /*
+ * Mark the memslot as inhibited to prevent reallocating the
+ * memslot during vCPU creation, e.g. if a vCPU is hotplugged.
+ */
+ kvm->arch.apic_access_memslot_inhibited = true;
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ kvm_vcpu_srcu_read_lock(vcpu);
}
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -2484,7 +2676,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2756,9 +2947,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
}
memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s));
- if (!apic_x2apic_mode(apic))
- kvm_lapic_xapic_id_updated(apic);
-
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
kvm_recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
@@ -2772,7 +2960,6 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
__start_apic_timer(apic, APIC_TMCCT);
kvm_lapic_set_reg(apic, APIC_TMCCT, 0);
kvm_apic_update_apicv(vcpu);
- apic->highest_isr_cache = -1;
if (apic->apicv_active) {
static_call_cond(kvm_x86_apicv_post_state_restore)(vcpu);
static_call_cond(kvm_x86_hwapic_irr_update)(vcpu, apic_find_highest_irr(apic));
@@ -2943,13 +3130,17 @@ static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
static int kvm_lapic_msr_write(struct kvm_lapic *apic, u32 reg, u64 data)
{
/*
- * ICR is a 64-bit register in x2APIC mode (and Hyper'v PV vAPIC) and
+ * ICR is a 64-bit register in x2APIC mode (and Hyper-V PV vAPIC) and
* can be written as such, all other registers remain accessible only
* through 32-bit reads/writes.
*/
if (reg == APIC_ICR)
return kvm_x2apic_icr_write(apic, data);
+ /* Bits 63:32 are reserved in all other registers. */
+ if (data >> 32)
+ return 1;
+
return kvm_lapic_reg_write(apic, reg, (u32)data);
}
@@ -2972,9 +3163,6 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
return 1;
- if (reg == APIC_DFR)
- return 1;
-
return kvm_lapic_msr_read(apic, reg, data);
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 58c3242fcc7a..0a0ea4b5dd8c 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -112,6 +112,8 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
+int kvm_alloc_apic_access_page(struct kvm *kvm);
+void kvm_inhibit_apic_access_page(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
@@ -144,6 +146,8 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len);
void kvm_lapic_exit(void);
+u64 kvm_lapic_readable_reg_mask(struct kvm_lapic *apic);
+
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 6bdaacb6faa0..168c46fd8dd1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -230,14 +230,14 @@ static inline bool kvm_shadow_root_allocated(struct kvm *kvm)
}
#ifdef CONFIG_X86_64
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return kvm->arch.tdp_mmu_enabled; }
+extern bool tdp_mmu_enabled;
#else
-static inline bool is_tdp_mmu_enabled(struct kvm *kvm) { return false; }
+#define tdp_mmu_enabled false
#endif
static inline bool kvm_memslots_have_rmaps(struct kvm *kvm)
{
- return !is_tdp_mmu_enabled(kvm) || kvm_shadow_root_allocated(kvm);
+ return !tdp_mmu_enabled || kvm_shadow_root_allocated(kvm);
}
static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 835426254e76..c8ebe542c565 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -14,6 +14,7 @@
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "irq.h"
#include "ioapic.h"
@@ -43,6 +44,7 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/kern_levels.h>
+#include <linux/kstrtox.h>
#include <linux/kthread.h>
#include <asm/page.h>
@@ -99,6 +101,13 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644);
*/
bool tdp_enabled = false;
+static bool __ro_after_init tdp_mmu_allowed;
+
+#ifdef CONFIG_X86_64
+bool __read_mostly tdp_mmu_enabled = true;
+module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0444);
+#endif
+
static int max_huge_page_level __read_mostly;
static int tdp_root_level __read_mostly;
static int max_tdp_level __read_mostly;
@@ -261,6 +270,17 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
kvm_flush_remote_tlbs_with_range(kvm, &range);
}
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index);
+
+/* Flush the range of guest memory mapped by the given SPTE. */
+static void kvm_flush_remote_tlbs_sptep(struct kvm *kvm, u64 *sptep)
+{
+ struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+ gfn_t gfn = kvm_mmu_page_get_gfn(sp, spte_index(sptep));
+
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
+}
+
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned int access)
{
@@ -609,9 +629,14 @@ static bool mmu_spte_age(u64 *sptep)
return true;
}
+static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
+{
+ return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
+}
+
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_begin();
} else {
/*
@@ -630,7 +655,7 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
{
- if (is_tdp_mmu(vcpu->arch.mmu)) {
+ if (is_tdp_mmu_active(vcpu)) {
kvm_tdp_mmu_walk_lockless_end();
} else {
/*
@@ -800,7 +825,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_gfn_disallow_lpage(slot, gfn);
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, PG_LEVEL_4K))
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, PG_LEVEL_4K);
}
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1174,8 +1199,7 @@ static void drop_large_spte(struct kvm *kvm, u64 *sptep, bool flush)
drop_spte(kvm, sptep);
if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
}
/*
@@ -1279,7 +1303,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, true);
@@ -1312,7 +1336,7 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
{
struct kvm_rmap_head *rmap_head;
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
slot->base_gfn + gfn_offset, mask, false);
@@ -1395,7 +1419,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
}
}
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
write_protected |=
kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
@@ -1456,7 +1480,7 @@ restart:
}
if (need_flush && kvm_available_flush_tlb_with_range()) {
- kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
return false;
}
@@ -1558,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
@@ -1571,7 +1595,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
return flush;
@@ -1626,8 +1650,7 @@ static void __rmap_add(struct kvm *kvm,
kvm->stat.max_mmu_rmap_size = rmap_count;
if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
kvm_zap_all_rmap_sptes(kvm, rmap_head);
- kvm_flush_remote_tlbs_with_address(
- kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_gfn(kvm, gfn, sp->role.level);
}
}
@@ -1646,7 +1669,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
return young;
@@ -1659,7 +1682,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
if (kvm_memslots_have_rmaps(kvm))
young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
return young;
@@ -1921,7 +1944,7 @@ static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
return true;
/* TDP MMU pages do not use the MMU generation. */
- return !sp->tdp_mmu_page &&
+ return !is_tdp_mmu_page(sp) &&
unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
}
@@ -2355,7 +2378,16 @@ static void __link_shadow_page(struct kvm *kvm,
mmu_page_add_parent_pte(cache, sp, sptep);
- if (sp->unsync_children || sp->unsync)
+ /*
+ * The non-direct sub-pagetable must be updated before linking. For
+ * L1 sp, the pagetable is updated via kvm_sync_page() in
+ * kvm_mmu_find_shadow_page() without write-protecting the gfn,
+ * so sp->unsync can be true or false. For higher level non-direct
+ * sp, the pagetable is updated/synced via mmu_sync_children() in
+ * FNAME(fetch)(), so sp->unsync_children can only be false.
+ * WARN_ON_ONCE() if anything happens unexpectedly.
+ */
+ if (WARN_ON_ONCE(sp->unsync_children) || sp->unsync)
mark_unsync(sptep);
}
@@ -2383,7 +2415,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
return;
drop_parent_pte(child, sptep);
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
}
}
@@ -2867,8 +2899,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
}
if (flush)
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
- KVM_PAGES_PER_HPAGE(level));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -3116,11 +3147,11 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
!is_large_pte(spte) &&
spte_to_child_sp(spte)->nx_huge_page_disallowed) {
/*
- * A small SPTE exists for this pfn, but FNAME(fetch)
- * and __direct_map would like to create a large PTE
- * instead: just force them to go down another level,
- * patching back for them into pfn the next 9 bits of
- * the address.
+ * A small SPTE exists for this pfn, but FNAME(fetch),
+ * direct_map(), or kvm_tdp_mmu_map() would like to create a
+ * large PTE instead: just force them to go down another level,
+ * patching back for them into pfn the next 9 bits of the
+ * address.
*/
u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
KVM_PAGES_PER_HPAGE(cur_level - 1);
@@ -3129,7 +3160,7 @@ void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_
}
}
-static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
@@ -3147,7 +3178,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -3173,14 +3204,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return ret;
}
-static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
+static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
{
- send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
+ unsigned long hva = gfn_to_hva_memslot(slot, gfn);
+
+ send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
}
-static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- if (is_sigpending_pfn(pfn)) {
+ if (is_sigpending_pfn(fault->pfn)) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
}
@@ -3190,43 +3223,43 @@ static int kvm_handle_error_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
*/
- if (pfn == KVM_PFN_ERR_RO_FAULT)
+ if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
return RET_PF_EMULATE;
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->slot, fault->gfn);
return RET_PF_RETRY;
}
return -EFAULT;
}
-static int handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
- unsigned int access)
+static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault,
+ unsigned int access)
{
- /* The pfn is invalid, report the error! */
- if (unlikely(is_error_pfn(fault->pfn)))
- return kvm_handle_error_pfn(vcpu, fault->gfn, fault->pfn);
+ gva_t gva = fault->is_tdp ? 0 : fault->addr;
- if (unlikely(!fault->slot)) {
- gva_t gva = fault->is_tdp ? 0 : fault->addr;
+ vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
+ access & shadow_mmio_access_mask);
- vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
- access & shadow_mmio_access_mask);
- /*
- * If MMIO caching is disabled, emulate immediately without
- * touching the shadow page tables as attempting to install an
- * MMIO SPTE will just be an expensive nop. Do not cache MMIO
- * whose gfn is greater than host.MAXPHYADDR, any guest that
- * generates such gfns is running nested and is being tricked
- * by L0 userspace (you can observe gfn > L1.MAXPHYADDR if
- * and only if L1's MAXPHYADDR is inaccurate with respect to
- * the hardware's).
- */
- if (unlikely(!enable_mmio_caching) ||
- unlikely(fault->gfn > kvm_mmu_max_gfn()))
- return RET_PF_EMULATE;
- }
+ /*
+ * If MMIO caching is disabled, emulate immediately without
+ * touching the shadow page tables as attempting to install an
+ * MMIO SPTE will just be an expensive nop.
+ */
+ if (unlikely(!enable_mmio_caching))
+ return RET_PF_EMULATE;
+
+ /*
+ * Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
+ * any guest that generates such gfns is running nested and is being
+ * tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
+ * only if L1's MAXPHYADDR is inaccurate with respect to the
+ * hardware's).
+ */
+ if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
+ return RET_PF_EMULATE;
return RET_PF_CONTINUE;
}
@@ -3350,7 +3383,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
do {
u64 new_spte;
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (tdp_mmu_enabled)
sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
else
sptep = fast_pf_get_last_sptep(vcpu, fault->addr, &spte);
@@ -3433,8 +3466,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
}
if (++retry_count > 4) {
- printk_once(KERN_WARNING
- "kvm: Fast #PF retrying more than 4 times.\n");
+ pr_warn_once("Fast #PF retrying more than 4 times.\n");
break;
}
@@ -3596,7 +3628,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_unlock;
- if (is_tdp_mmu_enabled(vcpu->kvm)) {
+ if (tdp_mmu_enabled) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
mmu->root.hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
@@ -4026,7 +4058,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
walk_shadow_page_lockless_begin(vcpu);
- if (is_tdp_mmu(vcpu->arch.mmu))
+ if (is_tdp_mmu_active(vcpu))
leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root);
else
leaf = get_walk(vcpu, addr, sptes, &root);
@@ -4174,7 +4206,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
-static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
+static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
struct kvm_memory_slot *slot = fault->slot;
bool async;
@@ -4235,12 +4267,33 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
return RET_PF_CONTINUE;
}
+static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+ unsigned int access)
+{
+ int ret;
+
+ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+ smp_rmb();
+
+ ret = __kvm_faultin_pfn(vcpu, fault);
+ if (ret != RET_PF_CONTINUE)
+ return ret;
+
+ if (unlikely(is_error_pfn(fault->pfn)))
+ return kvm_handle_error_pfn(vcpu, fault);
+
+ if (unlikely(!fault->slot))
+ return kvm_handle_noslot_fault(vcpu, fault, access);
+
+ return RET_PF_CONTINUE;
+}
+
/*
* Returns true if the page fault is stale and needs to be retried, i.e. if the
* root was invalidated by a memslot update or a relevant mmu_notifier fired.
*/
static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
- struct kvm_page_fault *fault, int mmu_seq)
+ struct kvm_page_fault *fault)
{
struct kvm_mmu_page *sp = to_shadow_page(vcpu->arch.mmu->root.hpa);
@@ -4260,19 +4313,13 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
return true;
return fault->slot &&
- mmu_invalidate_retry_hva(vcpu->kvm, mmu_seq, fault->hva);
+ mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva);
}
static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
- bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
-
- unsigned long mmu_seq;
int r;
- fault->gfn = fault->addr >> PAGE_SHIFT;
- fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn);
-
if (page_fault_handle_page_track(vcpu, fault))
return RET_PF_EMULATE;
@@ -4284,41 +4331,24 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (r)
return r;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, ACC_ALL);
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
if (r != RET_PF_CONTINUE)
return r;
r = RET_PF_RETRY;
+ write_lock(&vcpu->kvm->mmu_lock);
- if (is_tdp_mmu_fault)
- read_lock(&vcpu->kvm->mmu_lock);
- else
- write_lock(&vcpu->kvm->mmu_lock);
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ r = make_mmu_pages_available(vcpu);
+ if (r)
goto out_unlock;
- if (is_tdp_mmu_fault) {
- r = kvm_tdp_mmu_map(vcpu, fault);
- } else {
- r = make_mmu_pages_available(vcpu);
- if (r)
- goto out_unlock;
- r = __direct_map(vcpu, fault);
- }
+ r = direct_map(vcpu, fault);
out_unlock:
- if (is_tdp_mmu_fault)
- read_unlock(&vcpu->kvm->mmu_lock);
- else
- write_unlock(&vcpu->kvm->mmu_lock);
+ write_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(fault->pfn);
return r;
}
@@ -4366,6 +4396,42 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
+#ifdef CONFIG_X86_64
+static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
+ struct kvm_page_fault *fault)
+{
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, fault))
+ return RET_PF_EMULATE;
+
+ r = fast_page_fault(vcpu, fault);
+ if (r != RET_PF_INVALID)
+ return r;
+
+ r = mmu_topup_memory_caches(vcpu, false);
+ if (r)
+ return r;
+
+ r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
+ if (r != RET_PF_CONTINUE)
+ return r;
+
+ r = RET_PF_RETRY;
+ read_lock(&vcpu->kvm->mmu_lock);
+
+ if (is_page_fault_stale(vcpu, fault))
+ goto out_unlock;
+
+ r = kvm_tdp_mmu_map(vcpu, fault);
+
+out_unlock:
+ read_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(fault->pfn);
+ return r;
+}
+#endif
+
int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
/*
@@ -4383,13 +4449,19 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (shadow_memtype_mask && kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
- gfn_t base = (fault->addr >> PAGE_SHIFT) & ~(page_num - 1);
+ gfn_t base = gfn_round_for_level(fault->gfn,
+ fault->max_level);
if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
break;
}
}
+#ifdef CONFIG_X86_64
+ if (tdp_mmu_enabled)
+ return kvm_tdp_mmu_page_fault(vcpu, fault);
+#endif
+
return direct_page_fault(vcpu, fault);
}
@@ -4494,10 +4566,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
struct kvm_mmu *mmu = vcpu->arch.mmu;
union kvm_mmu_page_role new_role = mmu->root_role;
- if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
- /* kvm_mmu_ensure_valid_pgd will set up a new root. */
+ /*
+ * Return immediately if no usable root was found, kvm_mmu_reload()
+ * will establish a valid root prior to the next VM-Enter.
+ */
+ if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role))
return;
- }
/*
* It's possible that the cached previous root page is obsolete because
@@ -5719,6 +5793,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
tdp_root_level = tdp_forced_root_level;
max_tdp_level = tdp_max_root_level;
+#ifdef CONFIG_X86_64
+ tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
+#endif
/*
* max_huge_page_level reflects KVM's MMU capabilities irrespective
* of kernel support, e.g. KVM may be capable of using 1GB pages when
@@ -5966,7 +6043,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* write and in the same critical section as making the reload request,
* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_invalidate_all_roots(kvm);
/*
@@ -5991,7 +6068,7 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)
* Deferring the zap until the final reference to the root is put would
* lead to use-after-free.
*/
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_invalidated_roots(kvm);
}
@@ -6017,9 +6094,11 @@ int kvm_mmu_init_vm(struct kvm *kvm)
INIT_LIST_HEAD(&kvm->arch.possible_nx_huge_pages);
spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
- r = kvm_mmu_init_tdp_mmu(kvm);
- if (r < 0)
- return r;
+ if (tdp_mmu_enabled) {
+ r = kvm_mmu_init_tdp_mmu(kvm);
+ if (r < 0)
+ return r;
+ }
node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
@@ -6049,7 +6128,8 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
kvm_page_track_unregister_notifier(kvm, node);
- kvm_mmu_uninit_tdp_mmu(kvm);
+ if (tdp_mmu_enabled)
+ kvm_mmu_uninit_tdp_mmu(kvm);
mmu_free_vm_memory_caches(kvm);
}
@@ -6103,7 +6183,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_leafs(kvm, i, gfn_start,
gfn_end, true, flush);
@@ -6136,7 +6216,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
read_unlock(&kvm->mmu_lock);
@@ -6379,7 +6459,7 @@ void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
u64 start, u64 end,
int target_level)
{
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm))
@@ -6400,7 +6480,7 @@ void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
u64 start = memslot->base_gfn;
u64 end = start + memslot->npages;
- if (!is_tdp_mmu_enabled(kvm))
+ if (!tdp_mmu_enabled)
return;
if (kvm_memslots_have_rmaps(kvm)) {
@@ -6450,8 +6530,7 @@ restart:
kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
- kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(kvm, sptep);
else
need_tlb_flush = 1;
@@ -6483,7 +6562,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
@@ -6518,7 +6597,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}
- if (is_tdp_mmu_enabled(kvm)) {
+ if (tdp_mmu_enabled) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
read_unlock(&kvm->mmu_lock);
@@ -6553,7 +6632,7 @@ restart:
kvm_mmu_commit_zap_page(kvm, &invalid_list);
- if (is_tdp_mmu_enabled(kvm))
+ if (tdp_mmu_enabled)
kvm_tdp_mmu_zap_all(kvm);
write_unlock(&kvm->mmu_lock);
@@ -6579,7 +6658,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
* zap all shadow pages.
*/
if (unlikely(gen == 0)) {
- kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_zap_all_fast(kvm);
}
}
@@ -6684,7 +6763,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
new_val = 1;
else if (sysfs_streq(val, "auto"))
new_val = get_nx_auto_mode();
- else if (strtobool(val, &new_val) < 0)
+ else if (kstrtobool(val, &new_val) < 0)
return -EINVAL;
__set_nx_huge_pages(new_val);
@@ -6718,6 +6797,13 @@ void __init kvm_mmu_x86_module_init(void)
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+ /*
+ * Snapshot userspace's desire to enable the TDP MMU. Whether or not the
+ * TDP MMU is actually enabled is determined in kvm_configure_mmu()
+ * when the vendor module is loaded.
+ */
+ tdp_mmu_allowed = tdp_mmu_enabled;
+
kvm_mmu_spte_module_init();
}
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index dbaf6755c5a7..cc58631e2336 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -156,6 +156,11 @@ static inline bool kvm_mmu_page_ad_need_write_protect(struct kvm_mmu_page *sp)
return kvm_x86_ops.cpu_dirty_log_size && sp->role.guest_mode;
}
+static inline gfn_t gfn_round_for_level(gfn_t gfn, int level)
+{
+ return gfn & -KVM_PAGES_PER_HPAGE(level);
+}
+
int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
gfn_t gfn, bool can_unsync, bool prefetch);
@@ -164,8 +169,17 @@ void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn,
int min_level);
+
void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
u64 start_gfn, u64 pages);
+
+/* Flush the given page (huge or not) of guest memory. */
+static inline void kvm_flush_remote_tlbs_gfn(struct kvm *kvm, gfn_t gfn, int level)
+{
+ kvm_flush_remote_tlbs_with_address(kvm, gfn_round_for_level(gfn, level),
+ KVM_PAGES_PER_HPAGE(level));
+}
+
unsigned int pte_list_count(struct kvm_rmap_head *rmap_head);
extern int nx_huge_pages;
@@ -199,7 +213,7 @@ struct kvm_page_fault {
/*
* Maximum page size that can be created for this fault; input to
- * FNAME(fetch), __direct_map and kvm_tdp_mmu_map.
+ * FNAME(fetch), direct_map() and kvm_tdp_mmu_map().
*/
u8 max_level;
@@ -222,6 +236,7 @@ struct kvm_page_fault {
struct kvm_memory_slot *slot;
/* Outputs of kvm_faultin_pfn. */
+ unsigned long mmu_seq;
kvm_pfn_t pfn;
hva_t hva;
bool map_writable;
@@ -279,6 +294,11 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
};
int r;
+ if (vcpu->arch.mmu->root_role.direct) {
+ fault.gfn = fault.addr >> PAGE_SHIFT;
+ fault.slot = kvm_vcpu_gfn_to_memslot(vcpu, fault.gfn);
+ }
+
/*
* Async #PF "faults", a.k.a. prefetch faults, are not faults from the
* guest perspective and have already been counted at the time of the
diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c
index 2e09d1b6249f..0a2ac438d647 100644
--- a/arch/x86/kvm/mmu/page_track.c
+++ b/arch/x86/kvm/mmu/page_track.c
@@ -10,6 +10,7 @@
* Author:
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <linux/rculist.h>
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 0f6455072055..57f0b75c80f9 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -642,12 +642,12 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
goto out_gpte_changed;
- for (shadow_walk_init(&it, vcpu, fault->addr);
- shadow_walk_okay(&it) && it.level > gw->level;
- shadow_walk_next(&it)) {
+ for_each_shadow_entry(vcpu, fault->addr, it) {
gfn_t table_gfn;
clear_sp_write_flooding_count(it.sptep);
+ if (it.level == gw->level)
+ break;
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
@@ -692,8 +692,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
trace_kvm_mmu_spte_requested(fault);
for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
- clear_sp_write_flooding_count(it.sptep);
-
/*
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
@@ -701,7 +699,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
if (fault->nx_huge_page_workaround_enabled)
disallowed_hugepage_adjust(fault, *it.sptep, it.level);
- base_gfn = fault->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gfn_round_for_level(fault->gfn, it.level);
if (it.level == fault->goal_level)
break;
@@ -791,7 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
{
struct guest_walker walker;
int r;
- unsigned long mmu_seq;
bool is_self_change_mapping;
pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
@@ -838,14 +835,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
else
fault->max_level = walker.level;
- mmu_seq = vcpu->kvm->mmu_invalidate_seq;
- smp_rmb();
-
- r = kvm_faultin_pfn(vcpu, fault);
- if (r != RET_PF_CONTINUE)
- return r;
-
- r = handle_abnormal_pfn(vcpu, fault, walker.pte_access);
+ r = kvm_faultin_pfn(vcpu, fault, walker.pte_access);
if (r != RET_PF_CONTINUE)
return r;
@@ -871,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = RET_PF_RETRY;
write_lock(&vcpu->kvm->mmu_lock);
- if (is_page_fault_stale(vcpu, fault, mmu_seq))
+ if (is_page_fault_stale(vcpu, fault))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
@@ -937,8 +927,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
mmu_page_zap_pte(vcpu->kvm, sp, sptep, NULL);
if (is_shadow_present_pte(old_spte))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm,
- sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
+ kvm_flush_remote_tlbs_sptep(vcpu->kvm, sptep);
if (!rmap_can_add(vcpu))
break;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index c0fd7e049b4e..c15bfca3ed15 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -7,7 +7,7 @@
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2020 Red Hat, Inc. and/or its affiliates.
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "mmu.h"
@@ -147,9 +147,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
WARN_ON_ONCE(!pte_access && !shadow_present_mask);
if (sp->role.ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else if (kvm_mmu_page_ad_need_write_protect(sp))
- spte |= SPTE_TDP_AD_WRPROT_ONLY_MASK;
+ spte |= SPTE_TDP_AD_WRPROT_ONLY;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
@@ -317,7 +317,7 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
shadow_user_mask | shadow_x_mask | shadow_me_value;
if (ad_disabled)
- spte |= SPTE_TDP_AD_DISABLED_MASK;
+ spte |= SPTE_TDP_AD_DISABLED;
else
spte |= shadow_accessed_mask;
@@ -352,7 +352,7 @@ u64 mark_spte_for_access_track(u64 spte)
WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT),
- "kvm: Access Tracking saved bit locations are not zero\n");
+ "Access Tracking saved bit locations are not zero\n");
spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) <<
SHADOW_ACC_TRACK_SAVED_BITS_SHIFT;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6f54dc9409c9..1279db2eab44 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -28,10 +28,10 @@
*/
#define SPTE_TDP_AD_SHIFT 52
#define SPTE_TDP_AD_MASK (3ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_ENABLED_MASK (0ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_DISABLED_MASK (1ULL << SPTE_TDP_AD_SHIFT)
-#define SPTE_TDP_AD_WRPROT_ONLY_MASK (2ULL << SPTE_TDP_AD_SHIFT)
-static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
+#define SPTE_TDP_AD_ENABLED (0ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_DISABLED (1ULL << SPTE_TDP_AD_SHIFT)
+#define SPTE_TDP_AD_WRPROT_ONLY (2ULL << SPTE_TDP_AD_SHIFT)
+static_assert(SPTE_TDP_AD_ENABLED == 0);
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
#define SPTE_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
@@ -164,7 +164,7 @@ extern u64 __read_mostly shadow_me_value;
extern u64 __read_mostly shadow_me_mask;
/*
- * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED_MASK;
+ * SPTEs in MMUs without A/D bits are marked with SPTE_TDP_AD_DISABLED;
* shadow_acc_track_mask is the set of bits to be cleared in non-accessed
* pages.
*/
@@ -266,18 +266,18 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_DISABLED;
}
static inline bool spte_ad_need_write_protect(u64 spte)
{
MMU_WARN_ON(!is_shadow_present_pte(spte));
/*
- * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED_MASK is '0',
+ * This is benign for non-TDP SPTEs as SPTE_TDP_AD_ENABLED is '0',
* and non-TDP SPTEs will never set these bits. Optimize for 64-bit
* TDP and do the A/D type check unconditionally.
*/
- return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED_MASK;
+ return (spte & SPTE_TDP_AD_MASK) != SPTE_TDP_AD_ENABLED;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -435,11 +435,11 @@ static inline void check_spte_writable_invariants(u64 spte)
{
if (spte & shadow_mmu_writable_mask)
WARN_ONCE(!(spte & shadow_host_writable_mask),
- "kvm: MMU-writable SPTE is not Host-writable: %llx",
+ KBUILD_MODNAME ": MMU-writable SPTE is not Host-writable: %llx",
spte);
else
WARN_ONCE(is_writable_pte(spte),
- "kvm: Writable SPTE is not MMU-writable: %llx", spte);
+ KBUILD_MODNAME ": Writable SPTE is not MMU-writable: %llx", spte);
}
static inline bool is_mmu_writable_spte(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_iter.c b/arch/x86/kvm/mmu/tdp_iter.c
index 39b48e7d7d1a..d2eb0d4f8710 100644
--- a/arch/x86/kvm/mmu/tdp_iter.c
+++ b/arch/x86/kvm/mmu/tdp_iter.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu_internal.h"
#include "tdp_iter.h"
@@ -15,11 +16,6 @@ static void tdp_iter_refresh_sptep(struct tdp_iter *iter)
iter->old_spte = kvm_tdp_mmu_read_spte(iter->sptep);
}
-static gfn_t round_gfn_for_level(gfn_t gfn, int level)
-{
- return gfn & -KVM_PAGES_PER_HPAGE(level);
-}
-
/*
* Return the TDP iterator to the root PT and allow it to continue its
* traversal over the paging structure from there.
@@ -30,7 +26,7 @@ void tdp_iter_restart(struct tdp_iter *iter)
iter->yielded_gfn = iter->next_last_level_gfn;
iter->level = iter->root_level;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
iter->valid = true;
@@ -97,7 +93,7 @@ static bool try_step_down(struct tdp_iter *iter)
iter->level--;
iter->pt_path[iter->level - 1] = child_pt;
- iter->gfn = round_gfn_for_level(iter->next_last_level_gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->next_last_level_gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
@@ -139,7 +135,7 @@ static bool try_step_up(struct tdp_iter *iter)
return false;
iter->level++;
- iter->gfn = round_gfn_for_level(iter->gfn, iter->level);
+ iter->gfn = gfn_round_for_level(iter->gfn, iter->level);
tdp_iter_refresh_sptep(iter);
return true;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d6df38d371a0..7c25dbf32ecc 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "mmu.h"
#include "mmu_internal.h"
@@ -10,23 +11,15 @@
#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>
-static bool __read_mostly tdp_mmu_enabled = true;
-module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644);
-
/* Initializes the TDP MMU for the VM, if enabled. */
int kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
struct workqueue_struct *wq;
- if (!tdp_enabled || !READ_ONCE(tdp_mmu_enabled))
- return 0;
-
wq = alloc_workqueue("kvm", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 0);
if (!wq)
return -ENOMEM;
- /* This should not be changed for the lifetime of the VM. */
- kvm->arch.tdp_mmu_enabled = true;
INIT_LIST_HEAD(&kvm->arch.tdp_mmu_roots);
spin_lock_init(&kvm->arch.tdp_mmu_pages_lock);
kvm->arch.tdp_mmu_zap_wq = wq;
@@ -47,9 +40,6 @@ static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
- if (!kvm->arch.tdp_mmu_enabled)
- return;
-
/* Also waits for any queued work items. */
destroy_workqueue(kvm->arch.tdp_mmu_zap_wq);
@@ -144,7 +134,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root,
if (!refcount_dec_and_test(&root->tdp_mmu_root_count))
return;
- WARN_ON(!root->tdp_mmu_page);
+ WARN_ON(!is_tdp_mmu_page(root));
/*
* The root now has refcount=0. It is valid, but readers already
@@ -690,8 +680,7 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
if (ret)
return ret;
- kvm_flush_remote_tlbs_with_address(kvm, iter->gfn,
- KVM_PAGES_PER_HPAGE(iter->level));
+ kvm_flush_remote_tlbs_gfn(kvm, iter->gfn, iter->level);
/*
* No other thread can overwrite the removed SPTE as they must either
@@ -1090,8 +1079,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
return RET_PF_RETRY;
else if (is_shadow_present_pte(iter->old_spte) &&
!is_last_spte(iter->old_spte, iter->level))
- kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
- KVM_PAGES_PER_HPAGE(iter->level + 1));
+ kvm_flush_remote_tlbs_gfn(vcpu->kvm, iter->gfn, iter->level);
/*
* If the page fault was caused by a write but the page is write
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index d3714200b932..0a63b1afabd3 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -7,6 +7,9 @@
#include "spte.h"
+int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
+void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
+
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu);
__must_check static inline bool kvm_tdp_mmu_get_root(struct kvm_mmu_page *root)
@@ -68,31 +71,9 @@ u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr,
u64 *spte);
#ifdef CONFIG_X86_64
-int kvm_mmu_init_tdp_mmu(struct kvm *kvm);
-void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm);
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return sp->tdp_mmu_page; }
-
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu)
-{
- struct kvm_mmu_page *sp;
- hpa_t hpa = mmu->root.hpa;
-
- if (WARN_ON(!VALID_PAGE(hpa)))
- return false;
-
- /*
- * A NULL shadow page is legal when shadowing a non-paging guest with
- * PAE paging, as the MMU will be direct with root_hpa pointing at the
- * pae_root page, not a shadow page.
- */
- sp = to_shadow_page(hpa);
- return sp && is_tdp_mmu_page(sp) && sp->root_count;
-}
#else
-static inline int kvm_mmu_init_tdp_mmu(struct kvm *kvm) { return 0; }
-static inline void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) {}
static inline bool is_tdp_mmu_page(struct kvm_mmu_page *sp) { return false; }
-static inline bool is_tdp_mmu(struct kvm_mmu *mmu) { return false; }
#endif
#endif /* __KVM_X86_MMU_TDP_MMU_H */
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index a8502e02f479..9fac1ec03463 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -13,6 +13,7 @@
* Paolo Bonzini <pbonzini@redhat.com>
* Xiao Guangrong <guangrong.xiao@linux.intel.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include <asm/mtrr.h>
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eb594620dd75..612e6c70ce2e 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -9,6 +9,7 @@
* Gleb Natapov <gleb@redhat.com>
* Wei Huang <wei@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/kvm_host.h>
@@ -28,9 +29,18 @@
struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);
-static const struct x86_cpu_id vmx_icl_pebs_cpu[] = {
+/* Precise Distribution of Instructions Retired (PDIR) */
+static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
+ /* Instruction-Accurate PDIR (PDIR++) */
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
+ {}
+};
+
+/* Precise Distribution (PDist) */
+static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
{}
};
@@ -155,6 +165,28 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
+static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
+{
+ /*
+ * For some model specific pebs counters with special capabilities
+ * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise
+ * level to the maximum value (currently 3, backwards compatible)
+ * so that the perf subsystem would assign specific hardware counter
+ * with that capability for vPMC.
+ */
+ if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
+ (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
+ return 3;
+
+ /*
+ * The non-zero precision level of guest event makes the ordinary
+ * guest event becomes a guest PEBS event and triggers the host
+ * PEBS PMI handler to determine whether the PEBS overflow PMI
+ * comes from the host counters or the guest.
+ */
+ return 1;
+}
+
static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
bool exclude_user, bool exclude_kernel,
bool intr)
@@ -186,22 +218,12 @@ static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
}
if (pebs) {
/*
- * The non-zero precision level of guest event makes the ordinary
- * guest event becomes a guest PEBS event and triggers the host
- * PEBS PMI handler to determine whether the PEBS overflow PMI
- * comes from the host counters or the guest.
- *
* For most PEBS hardware events, the difference in the software
* precision levels of guest and host PEBS events will not affect
* the accuracy of the PEBS profiling result, because the "event IP"
* in the PEBS record is calibrated on the guest side.
- *
- * On Icelake everything is fine. Other hardware (GLC+, TNT+) that
- * could possibly care here is unsupported and needs changes.
*/
- attr.precise_ip = 1;
- if (x86_match_cpu(vmx_icl_pebs_cpu) && pmc->idx == 32)
- attr.precise_ip = 3;
+ attr.precise_ip = pmc_get_pebs_precise_level(pmc);
}
event = perf_event_create_kernel_counter(&attr, -1, current,
@@ -254,48 +276,128 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
-static int cmp_u64(const void *pa, const void *pb)
+static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
- u64 a = *(u64 *)pa;
- u64 b = *(u64 *)pb;
+ u64 a = *(u64 *)pa & mask;
+ u64 b = *(u64 *)pb & mask;
return (a > b) - (a < b);
}
+
+static int filter_sort_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE));
+}
+
+/*
+ * For the event filter, searching is done on the 'includes' list and
+ * 'excludes' list separately rather than on the 'events' list (which
+ * has both). As a result the exclude bit can be ignored.
+ */
+static int filter_event_cmp(const void *pa, const void *pb)
+{
+ return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
+}
+
+static int find_filter_index(u64 *events, u64 nevents, u64 key)
+{
+ u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
+ filter_event_cmp);
+
+ if (!fe)
+ return -1;
+
+ return fe - events;
+}
+
+static bool is_filter_entry_match(u64 filter_event, u64 umask)
+{
+ u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
+ u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;
+
+ BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
+ (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
+ ARCH_PERFMON_EVENTSEL_UMASK);
+
+ return (umask & mask) == match;
+}
+
+static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
+{
+ u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
+ u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
+ int i, index;
+
+ index = find_filter_index(events, nevents, event_select);
+ if (index < 0)
+ return false;
+
+ /*
+ * Entries are sorted by the event select. Walk the list in both
+ * directions to process all entries with the targeted event select.
+ */
+ for (i = index; i < nevents; i++) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ for (i = index - 1; i >= 0; i--) {
+ if (filter_event_cmp(&events[i], &event_select))
+ break;
+
+ if (is_filter_entry_match(events[i], umask))
+ return true;
+ }
+
+ return false;
+}
+
+static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
+ u64 eventsel)
+{
+ if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
+ !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
+ return f->action == KVM_PMU_EVENT_ALLOW;
+
+ return f->action == KVM_PMU_EVENT_DENY;
+}
+
+static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
+ int idx)
+{
+ int fixed_idx = idx - INTEL_PMC_IDX_FIXED;
+
+ if (filter->action == KVM_PMU_EVENT_DENY &&
+ test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+ if (filter->action == KVM_PMU_EVENT_ALLOW &&
+ !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
+ return false;
+
+ return true;
+}
+
static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
- struct kvm_pmu_event_filter *filter;
+ struct kvm_x86_pmu_event_filter *filter;
struct kvm *kvm = pmc->vcpu->kvm;
- bool allow_event = true;
- __u64 key;
- int idx;
if (!static_call(kvm_x86_pmu_hw_event_available)(pmc))
return false;
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (!filter)
- goto out;
+ return true;
- if (pmc_is_gp(pmc)) {
- key = pmc->eventsel & AMD64_RAW_EVENT_MASK_NB;
- if (bsearch(&key, filter->events, filter->nevents,
- sizeof(__u64), cmp_u64))
- allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
- else
- allow_event = filter->action == KVM_PMU_EVENT_DENY;
- } else {
- idx = pmc->idx - INTEL_PMC_IDX_FIXED;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
- allow_event = false;
- }
+ if (pmc_is_gp(pmc))
+ return is_gp_event_allowed(filter, pmc->eventsel);
-out:
- return allow_event;
+ return is_fixed_event_allowed(filter, pmc->idx);
}
static void reprogram_counter(struct kvm_pmc *pmc)
@@ -592,47 +694,133 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);
+static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
+{
+ u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
+ KVM_PMU_MASKED_ENTRY_UMASK_MASK |
+ KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
+ KVM_PMU_MASKED_ENTRY_EXCLUDE;
+ int i;
+
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & ~mask)
+ return false;
+ }
+
+ return true;
+}
+
+static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i, j;
+
+ for (i = 0, j = 0; i < filter->nevents; i++) {
+ /*
+ * Skip events that are impossible to match against a guest
+ * event. When filtering, only the event select + unit mask
+ * of the guest event is used. To maintain backwards
+ * compatibility, impossible filters can't be rejected :-(
+ */
+ if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
+ ARCH_PERFMON_EVENTSEL_UMASK))
+ continue;
+ /*
+ * Convert userspace events to a common in-kernel event so
+ * only one code path is needed to support both events. For
+ * the in-kernel events use masked events because they are
+ * flexible enough to handle both cases. To convert to masked
+ * events all that's needed is to add an "all ones" umask_mask,
+ * (unmasked filter events don't support EXCLUDE).
+ */
+ filter->events[j++] = filter->events[i] |
+ (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
+ }
+
+ filter->nevents = j;
+}
+
+static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
+{
+ int i;
+
+ if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
+ convert_to_masked_filter(filter);
+ else if (!is_masked_filter_valid(filter))
+ return -EINVAL;
+
+ /*
+ * Sort entries by event select and includes vs. excludes so that all
+ * entries for a given event select can be processed efficiently during
+ * filtering. The EXCLUDE flag uses a more significant bit than the
+ * event select, and so the sorted list is also effectively split into
+ * includes and excludes sub-lists.
+ */
+ sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
+ filter_sort_cmp, NULL);
+
+ i = filter->nevents;
+ /* Find the first EXCLUDE event (only supported for masked events). */
+ if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
+ for (i = 0; i < filter->nevents; i++) {
+ if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
+ break;
+ }
+ }
+
+ filter->nr_includes = i;
+ filter->nr_excludes = filter->nevents - filter->nr_includes;
+ filter->includes = filter->events;
+ filter->excludes = filter->events + filter->nr_includes;
+
+ return 0;
+}
+
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
- struct kvm_pmu_event_filter tmp, *filter;
+ struct kvm_pmu_event_filter __user *user_filter = argp;
+ struct kvm_x86_pmu_event_filter *filter;
+ struct kvm_pmu_event_filter tmp;
struct kvm_vcpu *vcpu;
unsigned long i;
size_t size;
int r;
- if (copy_from_user(&tmp, argp, sizeof(tmp)))
+ if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
return -EFAULT;
if (tmp.action != KVM_PMU_EVENT_ALLOW &&
tmp.action != KVM_PMU_EVENT_DENY)
return -EINVAL;
- if (tmp.flags != 0)
+ if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
return -EINVAL;
if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
return -E2BIG;
size = struct_size(filter, events, tmp.nevents);
- filter = kmalloc(size, GFP_KERNEL_ACCOUNT);
+ filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (!filter)
return -ENOMEM;
+ filter->action = tmp.action;
+ filter->nevents = tmp.nevents;
+ filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
+ filter->flags = tmp.flags;
+
r = -EFAULT;
- if (copy_from_user(filter, argp, size))
+ if (copy_from_user(filter->events, user_filter->events,
+ sizeof(filter->events[0]) * filter->nevents))
goto cleanup;
- /* Ensure nevents can't be changed between the user copies. */
- *filter = tmp;
-
- /*
- * Sort the in-kernel list so that we can search it with bsearch.
- */
- sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+ r = prepare_filter_lists(filter);
+ if (r)
+ goto cleanup;
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
synchronize_srcu_expedited(&kvm->srcu);
BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
@@ -643,8 +831,6 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);
- mutex_unlock(&kvm->lock);
-
r = 0;
cleanup:
kfree(filter);
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index ee67ba625094..be62c16f2265 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -18,12 +18,6 @@
#define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001
#define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002
-struct kvm_event_hw_type_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
-};
-
struct kvm_pmu_ops {
bool (*hw_event_available)(struct kvm_pmc *pmc);
bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
@@ -40,6 +34,9 @@ struct kvm_pmu_ops {
void (*reset)(struct kvm_vcpu *vcpu);
void (*deliver_pmi)(struct kvm_vcpu *vcpu);
void (*cleanup)(struct kvm_vcpu *vcpu);
+
+ const u64 EVENTSEL_EVENT;
+ const int MAX_NR_GP_COUNTERS;
};
void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
@@ -161,7 +158,7 @@ static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc)
extern struct x86_pmu_capability kvm_pmu_cap;
-static inline void kvm_init_pmu_capability(void)
+static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops)
{
bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL;
@@ -192,6 +189,8 @@ static inline void kvm_init_pmu_capability(void)
}
kvm_pmu_cap.version = min(kvm_pmu_cap.version, 2);
+ kvm_pmu_cap.num_counters_gp = min(kvm_pmu_cap.num_counters_gp,
+ pmu_ops->MAX_NR_GP_COUNTERS);
kvm_pmu_cap.num_counters_fixed = min(kvm_pmu_cap.num_counters_fixed,
KVM_PMC_MAX_FIXED);
}
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 81f4e9ce0c77..a5717282bb9c 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -14,6 +14,7 @@
enum kvm_only_cpuid_leafs {
CPUID_12_EAX = NCAPINTS,
CPUID_7_1_EDX,
+ CPUID_8000_0007_EDX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -43,6 +44,9 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+/* CPUID level 0x80000007 (EDX). */
+#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
+
struct cpuid_reg {
u32 function;
u32 index;
@@ -68,6 +72,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_12_EAX] = {0x00000012, 0, CPUID_EAX},
[CPUID_8000_001F_EAX] = {0x8000001f, 0, CPUID_EAX},
[CPUID_7_1_EDX] = { 7, 1, CPUID_EDX},
+ [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX},
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
};
@@ -101,6 +106,8 @@ static __always_inline u32 __feature_translate(int x86_feature)
return KVM_X86_FEATURE_SGX2;
else if (x86_feature == X86_FEATURE_SGX_EDECCSSA)
return KVM_X86_FEATURE_SGX_EDECCSSA;
+ else if (x86_feature == X86_FEATURE_CONSTANT_TSC)
+ return KVM_X86_FEATURE_CONSTANT_TSC;
return x86_feature;
}
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index a9c1c2af8d94..b42111a24cc2 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -1,4 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "x86.h"
@@ -110,8 +111,6 @@ static void check_smram_offsets(void)
void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm)
{
- BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
-
trace_kvm_smm_transition(vcpu->vcpu_id, vcpu->arch.smbase, entering_smm);
if (entering_smm) {
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6919dee69f18..ca684979e90d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/hashtable.h>
@@ -53,7 +53,7 @@ static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
static u32 next_vm_id = 0;
static bool next_vm_id_wrapped = 0;
static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
-enum avic_modes avic_mode;
+bool x2avic_enabled;
/*
* This is a wrapper of struct amd_iommu_ir_data.
@@ -72,20 +72,25 @@ static void avic_activate_vmcb(struct vcpu_svm *svm)
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- /* Note:
- * KVM can support hybrid-AVIC mode, where KVM emulates x2APIC
- * MSR accesses, while interrupt injection to a running vCPU
- * can be achieved using AVIC doorbell. The AVIC hardware still
- * accelerate MMIO accesses, but this does not cause any harm
- * as the guest is not supposed to access xAPIC mmio when uses x2APIC.
+ /*
+ * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR
+ * accesses, while interrupt injection to a running vCPU can be
+ * achieved using AVIC doorbell. KVM disables the APIC access page
+ * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling
+ * AVIC in hybrid mode activates only the doorbell mechanism.
*/
- if (apic_x2apic_mode(svm->vcpu.arch.apic) &&
- avic_mode == AVIC_MODE_X2) {
+ if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) {
vmcb->control.int_ctl |= X2APIC_MODE_MASK;
vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID;
/* Disabling MSR intercept for x2APIC registers */
svm_set_x2apic_msr_interception(svm, false);
} else {
+ /*
+ * Flush the TLB, the guest may have inserted a non-APIC
+ * mapping into the TLB while AVIC was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu);
+
/* For xAVIC and hybrid-xAVIC modes */
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID;
/* Enabling MSR intercept for x2APIC registers */
@@ -241,8 +246,8 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
u64 *avic_physical_id_table;
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- if ((avic_mode == AVIC_MODE_X1 && index > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && index > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) ||
+ (index > X2AVIC_MAX_PHYSICAL_ID))
return NULL;
avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
@@ -250,47 +255,14 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
return &avic_physical_id_table[index];
}
-/*
- * Note:
- * AVIC hardware walks the nested page table to check permissions,
- * but does not use the SPA address specified in the leaf page
- * table entry since it uses address in the AVIC_BACKING_PAGE pointer
- * field of the VMCB. Therefore, we set up the
- * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
- */
-static int avic_alloc_access_page(struct kvm *kvm)
-{
- void __user *ret;
- int r = 0;
-
- mutex_lock(&kvm->slots_lock);
-
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
-
- ret = __x86_set_memory_region(kvm,
- APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
- if (IS_ERR(ret)) {
- r = PTR_ERR(ret);
- goto out;
- }
-
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- if ((avic_mode == AVIC_MODE_X1 && id > AVIC_MAX_PHYSICAL_ID) ||
- (avic_mode == AVIC_MODE_X2 && id > X2AVIC_MAX_PHYSICAL_ID))
+ if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) ||
+ (id > X2AVIC_MAX_PHYSICAL_ID))
return -EINVAL;
if (!vcpu->arch.apic->regs)
@@ -299,7 +271,13 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
if (kvm_apicv_activated(vcpu->kvm)) {
int ret;
- ret = avic_alloc_access_page(vcpu->kvm);
+ /*
+ * Note, AVIC hardware walks the nested page table to check
+ * permissions, but does not use the SPA address specified in
+ * the leaf SPTE since it uses address in the AVIC_BACKING_PAGE
+ * pointer field of the VMCB.
+ */
+ ret = kvm_alloc_apic_access_page(vcpu->kvm);
if (ret)
return ret;
}
@@ -339,6 +317,60 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
put_cpu();
}
+
+static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl)
+{
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+}
+
+static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id,
+ u32 icrl)
+{
+ /*
+ * KVM inhibits AVIC if any vCPU ID diverges from the vCPUs APIC ID,
+ * i.e. APIC ID == vCPU ID.
+ */
+ struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id);
+
+ /* Once again, nothing to do if the target vCPU doesn't exist. */
+ if (unlikely(!target_vcpu))
+ return;
+
+ avic_kick_vcpu(target_vcpu, icrl);
+}
+
+static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table,
+ u32 logid_index, u32 icrl)
+{
+ u32 physical_id;
+
+ if (avic_logical_id_table) {
+ u32 logid_entry = avic_logical_id_table[logid_index];
+
+ /* Nothing to do if the logical destination is invalid. */
+ if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
+ return;
+
+ physical_id = logid_entry &
+ AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
+ } else {
+ /*
+ * For x2APIC, the logical APIC ID is a read-only value that is
+ * derived from the x2APIC ID, thus the x2APIC ID can be found
+ * by reversing the calculation (stored in logid_index). Note,
+ * bits 31:20 of the x2APIC ID aren't propagated to the logical
+ * ID, but KVM limits the x2APIC ID limited to KVM_MAX_VCPU_IDS.
+ */
+ physical_id = logid_index;
+ }
+
+ avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl);
+}
+
/*
* A fast-path version of avic_kick_target_vcpus(), which attempts to match
* destination APIC ID to vCPU without looping through all vCPUs.
@@ -346,11 +378,10 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
- u32 l1_physical_id, dest;
- struct kvm_vcpu *target_vcpu;
int dest_mode = icrl & APIC_DEST_MASK;
int shorthand = icrl & APIC_SHORT_MASK;
struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 dest;
if (shorthand != APIC_DEST_NOSHORT)
return -EINVAL;
@@ -367,18 +398,18 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST)
return -EINVAL;
- l1_physical_id = dest;
-
- if (WARN_ON_ONCE(l1_physical_id != index))
+ if (WARN_ON_ONCE(dest != index))
return -EINVAL;
+ avic_kick_vcpu_by_physical_id(kvm, dest, icrl);
} else {
- u32 bitmap, cluster;
- int logid_index;
+ u32 *avic_logical_id_table;
+ unsigned long bitmap, i;
+ u32 cluster;
if (apic_x2apic_mode(source)) {
/* 16 bit dest mask, 16 bit cluster id */
- bitmap = dest & 0xFFFF0000;
+ bitmap = dest & 0xFFFF;
cluster = (dest >> 16) << 4;
} else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) {
/* 8 bit dest mask*/
@@ -390,67 +421,32 @@ static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source
cluster = (dest >> 4) << 2;
}
+ /* Nothing to do if there are no destinations in the cluster. */
if (unlikely(!bitmap))
- /* guest bug: nobody to send the logical interrupt to */
return 0;
- if (!is_power_of_2(bitmap))
- /* multiple logical destinations, use slow path */
- return -EINVAL;
-
- logid_index = cluster + __ffs(bitmap);
-
- if (!apic_x2apic_mode(source)) {
- u32 *avic_logical_id_table =
- page_address(kvm_svm->avic_logical_id_table_page);
-
- u32 logid_entry = avic_logical_id_table[logid_index];
-
- if (WARN_ON_ONCE(index != logid_index))
- return -EINVAL;
-
- /* guest bug: non existing/reserved logical destination */
- if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK)))
- return 0;
-
- l1_physical_id = logid_entry &
- AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
- } else {
- /*
- * For x2APIC logical mode, cannot leverage the index.
- * Instead, calculate physical ID from logical ID in ICRH.
- */
- int cluster = (icrh & 0xffff0000) >> 16;
- int apic = ffs(icrh & 0xffff) - 1;
-
- /*
- * If the x2APIC logical ID sub-field (i.e. icrh[15:0])
- * contains anything but a single bit, we cannot use the
- * fast path, because it is limited to a single vCPU.
- */
- if (apic < 0 || icrh != (1 << apic))
- return -EINVAL;
+ if (apic_x2apic_mode(source))
+ avic_logical_id_table = NULL;
+ else
+ avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
- l1_physical_id = (cluster << 4) + apic;
- }
+ /*
+ * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical
+ * IDs, thus each bit in the destination is guaranteed to map
+ * to at most one vCPU.
+ */
+ for_each_set_bit(i, &bitmap, 16)
+ avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table,
+ cluster + i, icrl);
}
- target_vcpu = kvm_get_vcpu_by_id(kvm, l1_physical_id);
- if (unlikely(!target_vcpu))
- /* guest bug: non existing vCPU is a target of this IPI*/
- return 0;
-
- target_vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(target_vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
return 0;
}
static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
u32 icrl, u32 icrh, u32 index)
{
+ u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh);
unsigned long i;
struct kvm_vcpu *vcpu;
@@ -466,21 +462,9 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
* since entered the guest will have processed pending IRQs at VMRUN.
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
- u32 dest;
-
- if (apic_x2apic_mode(vcpu->arch.apic))
- dest = icrh;
- else
- dest = GET_XAPIC_DEST_FIELD(icrh);
-
if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
- dest, icrl & APIC_DEST_MASK)) {
- vcpu->arch.apic->irr_pending = true;
- svm_complete_interrupt_delivery(vcpu,
- icrl & APIC_MODE_MASK,
- icrl & APIC_INT_LEVELTRIG,
- icrl & APIC_VECTOR_MASK);
- }
+ dest, icrl & APIC_DEST_MASK))
+ avic_kick_vcpu(vcpu, icrl);
}
}
@@ -496,14 +480,18 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
switch (id) {
+ case AVIC_IPI_FAILURE_INVALID_TARGET:
case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
/*
* Emulate IPIs that are not handled by AVIC hardware, which
- * only virtualizes Fixed, Edge-Triggered INTRs. The exit is
- * a trap, e.g. ICR holds the correct value and RIP has been
- * advanced, KVM is responsible only for emulating the IPI.
- * Sadly, hardware may sometimes leave the BUSY flag set, in
- * which case KVM needs to emulate the ICR write as well in
+ * only virtualizes Fixed, Edge-Triggered INTRs, and falls over
+ * if _any_ targets are invalid, e.g. if the logical mode mask
+ * is a superset of running vCPUs.
+ *
+ * The exit is a trap, e.g. ICR holds the correct value and RIP
+ * has been advanced, KVM is responsible only for emulating the
+ * IPI. Sadly, hardware may sometimes leave the BUSY flag set,
+ * in which case KVM needs to emulate the ICR write as well in
* order to clear the BUSY flag.
*/
if (icrl & APIC_ICR_BUSY)
@@ -519,8 +507,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
*/
avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
- case AVIC_IPI_FAILURE_INVALID_TARGET:
- break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
break;
@@ -541,33 +527,33 @@ unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
- int index;
u32 *logical_apic_id_table;
- int dlid = GET_APIC_LOGICAL_ID(ldr);
-
- if (!dlid)
- return NULL;
+ u32 cluster, index;
- if (flat) { /* flat */
- index = ffs(dlid) - 1;
- if (index > 7)
- return NULL;
- } else { /* cluster */
- int cluster = (dlid & 0xf0) >> 4;
- int apic = ffs(dlid & 0x0f) - 1;
+ ldr = GET_APIC_LOGICAL_ID(ldr);
- if ((apic < 0) || (apic > 7) ||
- (cluster >= 0xf))
+ if (flat) {
+ cluster = 0;
+ } else {
+ cluster = (ldr >> 4);
+ if (cluster >= 0xf)
return NULL;
- index = (cluster << 2) + apic;
+ ldr &= 0xf;
}
+ if (!ldr || !is_power_of_2(ldr))
+ return NULL;
+
+ index = __ffs(ldr);
+ if (WARN_ON_ONCE(index > 7))
+ return NULL;
+ index += (cluster << 2);
logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
+static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -575,15 +561,13 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
entry = avic_get_logical_id_entry(vcpu, ldr, flat);
if (!entry)
- return -EINVAL;
+ return;
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
-
- return 0;
}
static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
@@ -601,29 +585,23 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
}
-static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
u32 id = kvm_xapic_id(vcpu->arch.apic);
/* AVIC does not support LDR update for x2APIC */
if (apic_x2apic_mode(vcpu->arch.apic))
- return 0;
+ return;
if (ldr == svm->ldr_reg)
- return 0;
+ return;
avic_invalidate_logical_id_entry(vcpu);
- if (ldr)
- ret = avic_ldr_write(vcpu, id, ldr);
-
- if (!ret)
- svm->ldr_reg = ldr;
-
- return ret;
+ svm->ldr_reg = ldr;
+ avic_ldr_write(vcpu, id, ldr);
}
static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
@@ -645,12 +623,14 @@ static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu)
switch (offset) {
case APIC_LDR:
- if (avic_handle_ldr_update(vcpu))
- return 0;
+ avic_handle_ldr_update(vcpu);
break;
case APIC_DFR:
avic_handle_dfr_update(vcpu);
break;
+ case APIC_RRR:
+ /* Ignore writes to Read Remote Data, it's read-only. */
+ return 1;
default:
break;
}
@@ -739,18 +719,6 @@ void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu)
avic_handle_ldr_update(vcpu);
}
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
-{
- if (!lapic_in_kernel(vcpu) || avic_mode == AVIC_MODE_NONE)
- return;
-
- if (kvm_get_apic_mode(vcpu) == LAPIC_MODE_INVALID) {
- WARN_ONCE(true, "Invalid local APIC state (vcpu_id=%d)", vcpu->vcpu_id);
- return;
- }
- avic_refresh_apicv_exec_ctrl(vcpu);
-}
-
static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
{
int ret = 0;
@@ -995,23 +963,6 @@ out:
return ret;
}
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_NESTED) |
- BIT(APICV_INHIBIT_REASON_IRQWIN) |
- BIT(APICV_INHIBIT_REASON_PIT_REINJ) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_SEV) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
-
-
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
@@ -1064,6 +1015,7 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
+ WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
@@ -1092,17 +1044,15 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-
-void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb01.ptr;
- bool activated = kvm_vcpu_apicv_active(vcpu);
- if (!enable_apicv)
+ if (!lapic_in_kernel(vcpu) || !enable_apicv)
return;
- if (activated) {
+ if (kvm_vcpu_apicv_active(vcpu)) {
/**
* During AVIC temporary deactivation, guest could update
* APIC ID, DFR and LDR registers, which would not be trapped
@@ -1116,6 +1066,16 @@ void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
avic_deactivate_vmcb(svm);
}
vmcb_mark_dirty(vmcb, VMCB_AVIC);
+}
+
+void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+ bool activated = kvm_vcpu_apicv_active(vcpu);
+
+ if (!enable_apicv)
+ return;
+
+ avic_refresh_virtual_apic_mode(vcpu);
if (activated)
avic_vcpu_load(vcpu, vcpu->cpu);
@@ -1160,37 +1120,37 @@ void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
* - Hypervisor can support both xAVIC and x2AVIC in the same guest.
* - The mode can be switched at run-time.
*/
-bool avic_hardware_setup(struct kvm_x86_ops *x86_ops)
+bool avic_hardware_setup(void)
{
if (!npt_enabled)
return false;
+ /* AVIC is a prerequisite for x2AVIC. */
+ if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) {
+ if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
+ pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
+ pr_warn(FW_BUG "Try enable AVIC using force_avic option");
+ }
+ return false;
+ }
+
if (boot_cpu_has(X86_FEATURE_AVIC)) {
- avic_mode = AVIC_MODE_X1;
pr_info("AVIC enabled\n");
} else if (force_avic) {
/*
* Some older systems does not advertise AVIC support.
* See Revision Guide for specific AMD processor for more detail.
*/
- avic_mode = AVIC_MODE_X1;
pr_warn("AVIC is not supported in CPUID but force enabled");
pr_warn("Your system might crash and burn");
}
/* AVIC is a prerequisite for x2AVIC. */
- if (boot_cpu_has(X86_FEATURE_X2AVIC)) {
- if (avic_mode == AVIC_MODE_X1) {
- avic_mode = AVIC_MODE_X2;
- pr_info("x2AVIC enabled\n");
- } else {
- pr_warn(FW_BUG "Cannot support x2AVIC due to AVIC is disabled");
- pr_warn(FW_BUG "Try enable AVIC using force_avic option");
- }
- }
+ x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC);
+ if (x2avic_enabled)
+ pr_info("x2AVIC enabled\n");
- if (avic_mode != AVIC_MODE_NONE)
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- return !!avic_mode;
+ return true;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index add65dd59756..05d38944a6c0 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -12,7 +12,7 @@
* Avi Kivity <avi@qumranet.com>
*/
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -1008,7 +1008,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
- vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
@@ -1104,7 +1103,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
* to benefit from it right away.
*/
if (kvm_apicv_activated(vcpu->kvm))
- kvm_vcpu_update_apicv(vcpu);
+ __kvm_vcpu_update_apicv(vcpu);
return 0;
}
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 0e313fbae055..cc77a0681800 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -9,6 +9,8 @@
*
* Implementation is based on pmu_intel.c file
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -229,4 +231,6 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = {
.refresh = amd_pmu_refresh,
.init = amd_pmu_init,
.reset = amd_pmu_reset,
+ .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 86d6897f4806..c25aeb550cd9 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -6,6 +6,7 @@
*
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
@@ -812,7 +813,7 @@ static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
+ tpage = (void *)alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
@@ -1293,7 +1294,7 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
/* Pin guest memory */
@@ -1473,7 +1474,7 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
/* Check if we are crossing the page boundary */
offset = params.guest_uaddr & (PAGE_SIZE - 1);
- if ((params.guest_len + offset > PAGE_SIZE))
+ if (params.guest_len > PAGE_SIZE || (params.guest_len + offset) > PAGE_SIZE)
return -EINVAL;
hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 60c7c880266b..252e7f37e4e2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1,4 +1,4 @@
-#define pr_fmt(fmt) "SVM: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
@@ -519,21 +519,37 @@ static void svm_init_osvw(struct kvm_vcpu *vcpu)
vcpu->arch.osvw.status |= 1;
}
-static int has_svm(void)
+static bool kvm_is_svm_supported(void)
{
+ int cpu = raw_smp_processor_id();
const char *msg;
+ u64 vm_cr;
if (!cpu_has_svm(&msg)) {
- printk(KERN_INFO "has_svm: %s\n", msg);
- return 0;
+ pr_err("SVM not supported by CPU %d, %s\n", cpu, msg);
+ return false;
}
if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
- return 0;
+ return false;
}
- return 1;
+ rdmsrl(MSR_VM_CR, vm_cr);
+ if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE)) {
+ pr_err("SVM disabled (by BIOS) in MSR_VM_CR on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int svm_check_processor_compat(void)
+{
+ if (!kvm_is_svm_supported())
+ return -EIO;
+
+ return 0;
}
void __svm_write_tsc_multiplier(u64 multiplier)
@@ -572,10 +588,6 @@ static int svm_hardware_enable(void)
if (efer & EFER_SVME)
return -EBUSY;
- if (!has_svm()) {
- pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
- return -EINVAL;
- }
sd = per_cpu_ptr(&svm_data, me);
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
@@ -813,7 +825,7 @@ void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
if (intercept == svm->x2avic_msrs_intercepted)
return;
- if (avic_mode != AVIC_MODE_X2 ||
+ if (!x2avic_enabled ||
!apic_x2apic_mode(svm->vcpu.arch.apic))
return;
@@ -1326,6 +1338,9 @@ static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.microcode_version = 0x01000065;
svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;
+ svm->nmi_masked = false;
+ svm->awaiting_iret_completion = false;
+
if (sev_es_guest(vcpu->kvm))
sev_es_vcpu_reset(svm);
}
@@ -2076,7 +2091,7 @@ static void svm_handle_mce(struct kvm_vcpu *vcpu)
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
- pr_err("KVM: Guest triggered AMD Erratum 383\n");
+ pr_err("Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -2470,7 +2485,7 @@ static int iret_interception(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
++vcpu->stat.nmi_window_exits;
- vcpu->arch.hflags |= HF_IRET_MASK;
+ svm->awaiting_iret_completion = true;
if (!sev_es_guest(vcpu->kvm)) {
svm_clr_intercept(svm, INTERCEPT_IRET);
svm->nmi_iret_rip = kvm_rip_read(vcpu);
@@ -3003,8 +3018,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_DEBUGCTLMSR:
if (!lbrv) {
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
}
if (data & DEBUGCTL_RESERVED_BITS)
@@ -3033,7 +3047,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_VM_CR:
return svm_set_vm_cr(vcpu, data);
case MSR_VM_IGNNE:
- vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
+ kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
@@ -3466,7 +3480,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
if (svm->nmi_l1_to_l2)
return;
- vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->nmi_masked = true;
if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
@@ -3571,7 +3585,6 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- bool ret;
if (!gif_set(svm))
return true;
@@ -3579,10 +3592,8 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm))
return false;
- ret = (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
- (vcpu->arch.hflags & HF_NMI_MASK);
-
- return ret;
+ return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
+ svm->nmi_masked;
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
@@ -3602,7 +3613,7 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
{
- return !!(vcpu->arch.hflags & HF_NMI_MASK);
+ return to_svm(vcpu)->nmi_masked;
}
static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3610,11 +3621,11 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
struct vcpu_svm *svm = to_svm(vcpu);
if (masked) {
- vcpu->arch.hflags |= HF_NMI_MASK;
+ svm->nmi_masked = true;
if (!sev_es_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_IRET);
} else {
- vcpu->arch.hflags &= ~HF_NMI_MASK;
+ svm->nmi_masked = false;
if (!sev_es_guest(vcpu->kvm))
svm_clr_intercept(svm, INTERCEPT_IRET);
}
@@ -3700,7 +3711,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if ((vcpu->arch.hflags & (HF_NMI_MASK | HF_IRET_MASK)) == HF_NMI_MASK)
+ if (svm->nmi_masked && !svm->awaiting_iret_completion)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
@@ -3824,10 +3835,11 @@ static void svm_complete_interrupts(struct kvm_vcpu *vcpu)
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
- if ((vcpu->arch.hflags & HF_IRET_MASK) &&
+ if (svm->awaiting_iret_completion &&
(sev_es_guest(vcpu->kvm) ||
kvm_rip_read(vcpu) != svm->nmi_iret_rip)) {
- vcpu->arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+ svm->awaiting_iret_completion = false;
+ svm->nmi_masked = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -4076,17 +4088,6 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
vmcb_mark_dirty(svm->vmcb, VMCB_CR);
}
-static int is_disabled(void)
-{
- u64 vm_cr;
-
- rdmsrl(MSR_VM_CR, vm_cr);
- if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
- return 1;
-
- return 0;
-}
-
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
@@ -4098,11 +4099,6 @@ svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xd9;
}
-static int __init svm_check_processor_compat(void)
-{
- return 0;
-}
-
/*
* The kvm parameter can be NULL (module initialization, or invocation before
* VM creation). Be sure to check the kvm parameter before using it.
@@ -4629,7 +4625,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n");
/*
* If the fault occurred in userspace, arbitrarily inject #GP
@@ -4701,7 +4697,9 @@ static int svm_vm_init(struct kvm *kvm)
}
static struct kvm_x86_ops svm_x86_ops __initdata = {
- .name = "kvm_amd",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
.hardware_enable = svm_hardware_enable,
@@ -4771,10 +4769,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
- .set_virtual_apic_mode = avic_set_virtual_apic_mode,
+ .set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
- .check_apicv_inhibit_reasons = avic_check_apicv_inhibit_reasons,
.apicv_post_state_restore = avic_apicv_post_state_restore,
+ .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS,
.get_exit_info = svm_get_exit_info,
@@ -4981,7 +4979,7 @@ static __init int svm_hardware_setup(void)
}
if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ pr_info("Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
@@ -4999,7 +4997,7 @@ static __init int svm_hardware_setup(void)
/* Force VM NPT level equal to the host's paging level */
kvm_configure_mmu(npt_enabled, get_npt_level(),
get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+ pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
/* Setup shadow_me_value and shadow_me_mask */
kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask);
@@ -5025,12 +5023,14 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && avic_hardware_setup(&svm_x86_ops);
+ enable_apicv = avic = avic && avic_hardware_setup();
if (!enable_apicv) {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
+ } else if (!x2avic_enabled) {
+ svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true;
}
if (vls) {
@@ -5089,10 +5089,7 @@ err:
static struct kvm_x86_init_ops svm_init_ops __initdata = {
- .cpu_has_kvm_support = has_svm,
- .disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
.pmu_ops = &amd_pmu_ops,
@@ -5100,15 +5097,37 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ int r;
+
__unused_size_checks();
- return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
- __alignof__(struct vcpu_svm), THIS_MODULE);
+ if (!kvm_is_svm_supported())
+ return -EOPNOTSUPP;
+
+ r = kvm_x86_vendor_init(&svm_init_ops);
+ if (r)
+ return r;
+
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
+ return 0;
+
+err_kvm_init:
+ kvm_x86_vendor_exit();
+ return r;
}
static void __exit svm_exit(void)
{
kvm_exit();
+ kvm_x86_vendor_exit();
}
module_init(svm_init)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 4826e6cc611b..839809972da1 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -35,14 +35,7 @@ extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern int vgif;
extern bool intercept_smi;
-
-enum avic_modes {
- AVIC_MODE_NONE = 0,
- AVIC_MODE_X1,
- AVIC_MODE_X2,
-};
-
-extern enum avic_modes avic_mode;
+extern bool x2avic_enabled;
/*
* Clean bits in VMCB.
@@ -237,8 +230,26 @@ struct vcpu_svm {
struct svm_nested_state nested;
+ /* NMI mask value, used when vNMI is not enabled */
+ bool nmi_masked;
+
+ /*
+ * True when NMIs are still masked but guest IRET was just intercepted
+ * and KVM is waiting for RIP to change, which will signal that the
+ * intercepted IRET was retired and thus NMI can be unmasked.
+ */
+ bool awaiting_iret_completion;
+
+ /*
+ * Set when KVM is awaiting IRET completion and needs to inject NMIs as
+ * soon as the IRET completes (e.g. NMI is pending injection). KVM
+ * temporarily steals RFLAGS.TF to single-step the guest in this case
+ * in order to regain control as soon as the NMI-blocking condition
+ * goes away.
+ */
bool nmi_singlestep;
u64 nmi_singlestep_guest_rflags;
+
bool nmi_l1_to_l2;
unsigned long soft_int_csbase;
@@ -280,6 +291,9 @@ struct vcpu_svm {
bool guest_state_loaded;
bool x2avic_msrs_intercepted;
+
+ /* Guest GIF value, used when vGIF is not enabled */
+ bool guest_gif;
};
struct svm_cpu_data {
@@ -497,7 +511,7 @@ static inline void enable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl |= V_GIF_MASK;
else
- svm->vcpu.arch.hflags |= HF_GIF_MASK;
+ svm->guest_gif = true;
}
static inline void disable_gif(struct vcpu_svm *svm)
@@ -507,7 +521,7 @@ static inline void disable_gif(struct vcpu_svm *svm)
if (vmcb)
vmcb->control.int_ctl &= ~V_GIF_MASK;
else
- svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+ svm->guest_gif = false;
}
static inline bool gif_set(struct vcpu_svm *svm)
@@ -517,7 +531,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
if (vmcb)
return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
- return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+ return svm->guest_gif;
}
static inline bool nested_npt_enabled(struct vcpu_svm *svm)
@@ -628,8 +642,23 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb);
extern struct kvm_x86_nested_ops svm_nested_ops;
/* avic.c */
-
-bool avic_hardware_setup(struct kvm_x86_ops *ops);
+#define AVIC_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE) | \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_NESTED) | \
+ BIT(APICV_INHIBIT_REASON_IRQWIN) | \
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_SEV) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED) \
+)
+
+bool avic_hardware_setup(void);
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -641,14 +670,13 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void avic_vcpu_put(struct kvm_vcpu *vcpu);
void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu);
void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu);
-bool avic_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason);
int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
-void avic_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
+void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.c b/arch/x86/kvm/svm/svm_onhyperv.c
index 26a89d0da93e..7af8422d3382 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.c
+++ b/arch/x86/kvm/svm/svm_onhyperv.c
@@ -2,6 +2,7 @@
/*
* KVM L1 hypervisor optimizations on Hyper-V for SVM.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index 45faf84476ce..cff838f15db5 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -30,11 +30,11 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
hve->hv_enlightenments_control.msr_bitmap = 1;
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB) {
- pr_info("kvm: Hyper-V enlightened NPT TLB flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V enlightened NPT TLB flush enabled\n");
svm_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
svm_x86_ops.tlb_remote_flush_with_range =
hv_remote_flush_tlb_with_range;
@@ -43,7 +43,7 @@ static inline void svm_hv_hardware_setup(void)
if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH) {
int cpu;
- pr_info("kvm: Hyper-V Direct TLB Flush enabled\n");
+ pr_info(KBUILD_MODNAME ": Hyper-V Direct TLB Flush enabled\n");
for_each_online_cpu(cpu) {
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(cpu);
@@ -84,7 +84,7 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}
-static inline void svm_hv_hardware_setup(void)
+static inline __init void svm_hv_hardware_setup(void)
{
}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index cd2ac9536c99..45162c1bcd8f 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -66,13 +66,13 @@ struct vmcs_config {
u64 misc;
struct nested_vmx_msrs nested;
};
-extern struct vmcs_config vmcs_config;
+extern struct vmcs_config vmcs_config __ro_after_init;
struct vmx_capability {
u32 ept;
u32 vpid;
};
-extern struct vmx_capability vmx_capability;
+extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
diff --git a/arch/x86/kvm/vmx/hyperv.c b/arch/x86/kvm/vmx/hyperv.c
index ae03d1fe0355..22daca752797 100644
--- a/arch/x86/kvm/vmx/hyperv.c
+++ b/arch/x86/kvm/vmx/hyperv.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/errno.h>
#include <linux/smp.h>
@@ -361,35 +362,43 @@ enum evmcs_revision {
enum evmcs_ctrl_type {
EVMCS_EXIT_CTRLS,
EVMCS_ENTRY_CTRLS,
+ EVMCS_EXEC_CTRL,
EVMCS_2NDEXEC,
+ EVMCS_3RDEXEC,
EVMCS_PINCTRL,
EVMCS_VMFUNC,
NR_EVMCS_CTRLS,
};
-static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+static const u32 evmcs_supported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
[EVMCS_EXIT_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMEXIT_CTRL,
},
[EVMCS_ENTRY_CTRLS] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_EXEC_CTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_EXEC_CTRL,
},
[EVMCS_2NDEXEC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_2NDEXEC & ~SECONDARY_EXEC_TSC_SCALING,
+ },
+ [EVMCS_3RDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_3RDEXEC,
},
[EVMCS_PINCTRL] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_PINCTRL,
},
[EVMCS_VMFUNC] = {
- [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC,
+ [EVMCSv1_LEGACY] = EVMCS1_SUPPORTED_VMFUNC,
},
};
-static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type)
+static u32 evmcs_get_supported_ctls(enum evmcs_ctrl_type ctrl_type)
{
enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
- return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev];
+ return evmcs_supported_ctrls[ctrl_type][evmcs_rev];
}
static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
@@ -413,7 +422,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
{
u32 ctl_low = (u32)*pdata;
u32 ctl_high = (u32)(*pdata >> 32);
- u32 unsupported_ctrls;
+ u32 supported_ctrls;
/*
* Hyper-V 2016 and 2019 try using these features even when eVMCS
@@ -422,27 +431,31 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_EXIT_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS);
+ supported_ctrls = evmcs_get_supported_ctls(EVMCS_ENTRY_CTRLS);
if (!evmcs_has_perf_global_ctrl(vcpu))
- unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- ctl_high &= ~unsupported_ctrls;
+ supported_ctrls &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= supported_ctrls;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_EXEC_CTRL);
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_2NDEXEC);
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL);
+ ctl_high &= evmcs_get_supported_ctls(EVMCS_PINCTRL);
break;
case MSR_IA32_VMX_VMFUNC:
- ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC);
+ ctl_low &= evmcs_get_supported_ctls(EVMCS_VMFUNC);
break;
}
@@ -452,7 +465,7 @@ void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *
static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
u32 val)
{
- return !(val & evmcs_get_unsupported_ctls(ctrl_type));
+ return !(val & ~evmcs_get_supported_ctls(ctrl_type));
}
int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
@@ -461,6 +474,10 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
vmcs12->pin_based_vm_exec_control)))
return -EINVAL;
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXEC_CTRL,
+ vmcs12->cpu_based_vm_exec_control)))
+ return -EINVAL;
+
if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
vmcs12->secondary_vm_exec_control)))
return -EINVAL;
@@ -488,6 +505,38 @@ int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
return 0;
}
+#if IS_ENABLED(CONFIG_HYPERV)
+/*
+ * KVM on Hyper-V always uses the latest known eVMCSv1 revision, the assumption
+ * is: in case a feature has corresponding fields in eVMCS described and it was
+ * exposed in VMX feature MSRs, KVM is free to use it. Warn if KVM meets a
+ * feature which has no corresponding eVMCS field, this likely means that KVM
+ * needs to be updated.
+ */
+#define evmcs_check_vmcs_conf(field, ctrl) \
+ do { \
+ typeof(vmcs_conf->field) unsupported; \
+ \
+ unsupported = vmcs_conf->field & ~EVMCS1_SUPPORTED_ ## ctrl; \
+ if (unsupported) { \
+ pr_warn_once(#field " unsupported with eVMCS: 0x%llx\n",\
+ (u64)unsupported); \
+ vmcs_conf->field &= EVMCS1_SUPPORTED_ ## ctrl; \
+ } \
+ } \
+ while (0)
+
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+{
+ evmcs_check_vmcs_conf(cpu_based_exec_ctrl, EXEC_CTRL);
+ evmcs_check_vmcs_conf(pin_based_exec_ctrl, PINCTRL);
+ evmcs_check_vmcs_conf(cpu_based_2nd_exec_ctrl, 2NDEXEC);
+ evmcs_check_vmcs_conf(cpu_based_3rd_exec_ctrl, 3RDEXEC);
+ evmcs_check_vmcs_conf(vmentry_ctrl, VMENTRY_CTRL);
+ evmcs_check_vmcs_conf(vmexit_ctrl, VMEXIT_CTRL);
+}
+#endif
+
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index 571e7929d14e..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -48,22 +48,84 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
* Currently unsupported in KVM:
* GUEST_IA32_RTIT_CTL = 0x00002814,
*/
-#define EVMCS1_UNSUPPORTED_PINCTRL (PIN_BASED_POSTED_INTR | \
- PIN_BASED_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_EXEC_CTRL (CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
-#define EVMCS1_UNSUPPORTED_2NDEXEC \
- (SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
- SECONDARY_EXEC_APIC_REGISTER_VIRT | \
- SECONDARY_EXEC_ENABLE_PML | \
- SECONDARY_EXEC_ENABLE_VMFUNC | \
- SECONDARY_EXEC_SHADOW_VMCS | \
+#define EVMCS1_SUPPORTED_PINCTRL \
+ (PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING | \
+ PIN_BASED_VIRTUAL_NMIS)
+
+#define EVMCS1_SUPPORTED_EXEC_CTRL \
+ (CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | \
+ CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING | \
+ CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+
+#define EVMCS1_SUPPORTED_2NDEXEC \
+ (SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
SECONDARY_EXEC_TSC_SCALING | \
- SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
-#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define EVMCS1_SUPPORTED_3RDEXEC (0ULL)
+
+#define EVMCS1_SUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE | \
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMENTRY_CTRL \
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | \
+ VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_IA32E_MODE | \
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define EVMCS1_SUPPORTED_VMFUNC (0)
struct evmcs_field {
u16 offset;
@@ -117,9 +179,7 @@ static __always_inline int get_evmcs_offset(unsigned long field,
{
int offset = evmcs_field_offset(field, clean_field);
- WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
-
+ WARN_ONCE(offset < 0, "accessing unsupported EVMCS field %lx\n", field);
return offset;
}
@@ -136,7 +196,7 @@ static __always_inline void evmcs_write64(unsigned long field, u64 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write32(unsigned long field, u32 value)
+static __always_inline void evmcs_write32(unsigned long field, u32 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -148,7 +208,7 @@ static inline void evmcs_write32(unsigned long field, u32 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline void evmcs_write16(unsigned long field, u16 value)
+static __always_inline void evmcs_write16(unsigned long field, u16 value)
{
u16 clean_field;
int offset = get_evmcs_offset(field, &clean_field);
@@ -160,7 +220,7 @@ static inline void evmcs_write16(unsigned long field, u16 value)
current_evmcs->hv_clean_fields &= ~clean_field;
}
-static inline u64 evmcs_read64(unsigned long field)
+static __always_inline u64 evmcs_read64(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -170,7 +230,7 @@ static inline u64 evmcs_read64(unsigned long field)
return *(u64 *)((char *)current_evmcs + offset);
}
-static inline u32 evmcs_read32(unsigned long field)
+static __always_inline u32 evmcs_read32(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -180,7 +240,7 @@ static inline u32 evmcs_read32(unsigned long field)
return *(u32 *)((char *)current_evmcs + offset);
}
-static inline u16 evmcs_read16(unsigned long field)
+static __always_inline u16 evmcs_read16(unsigned long field)
{
int offset = get_evmcs_offset(field, NULL);
@@ -190,16 +250,6 @@ static inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -211,15 +261,15 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
+void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
-static inline void evmcs_write32(unsigned long field, u32 value) {}
-static inline void evmcs_write16(unsigned long field, u16 value) {}
-static inline u64 evmcs_read64(unsigned long field) { return 0; }
-static inline u32 evmcs_read32(unsigned long field) { return 0; }
-static inline u16 evmcs_read16(unsigned long field) { return 0; }
+static __always_inline void evmcs_write32(unsigned long field, u32 value) {}
+static __always_inline void evmcs_write16(unsigned long field, u16 value) {}
+static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
+static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
+static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d93c715cda6a..7c4f5ca405c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/objtool.h>
#include <linux/percpu.h>
@@ -203,7 +204,7 @@ static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
- pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
+ pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
@@ -5863,11 +5864,10 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu)
u32 function = kvm_rax_read(vcpu);
/*
- * VMFUNC is only supported for nested guests, but we always enable the
- * secondary control for simplicity; for non-nested mode, fake that we
- * didn't by injecting #UD.
+ * VMFUNC should never execute cleanly while L1 is active; KVM supports
+ * VMFUNC for nested VMs, but not for L1.
*/
- if (!is_guest_mode(vcpu)) {
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
@@ -6880,6 +6880,7 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_INVPCID |
+ SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_TSC_SCALING |
@@ -6912,18 +6913,13 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
SECONDARY_EXEC_ENABLE_PML;
msrs->ept_caps |= VMX_EPT_AD_BIT;
}
- }
- if (cpu_has_vmx_vmfunc()) {
- msrs->secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_VMFUNC;
/*
- * Advertise EPTP switching unconditionally
- * since we emulate it
+ * Advertise EPTP switching irrespective of hardware support,
+ * KVM emulates it in software so long as VMFUNC is supported.
*/
- if (enable_ept)
- msrs->vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (cpu_has_vmx_vmfunc())
+ msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING;
}
/*
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index e5cec07ca8d9..e8a3be0b9df9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -8,6 +8,8 @@
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
@@ -20,16 +22,19 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
-static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+static struct {
+ u8 eventsel;
+ u8 unit_mask;
+} const intel_arch_events[] = {
+ [0] = { 0x3c, 0x00 },
+ [1] = { 0xc0, 0x00 },
+ [2] = { 0x3c, 0x01 },
+ [3] = { 0x2e, 0x4f },
+ [4] = { 0x2e, 0x41 },
+ [5] = { 0xc4, 0x00 },
+ [6] = { 0xc5, 0x00 },
/* The above index must match CPUID 0x0A.EBX bit vector */
- [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03 },
};
/* mapping between fixed pmc index and intel_arch_events array */
@@ -762,8 +767,7 @@ void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu)
return;
warn:
- pr_warn_ratelimited("kvm: vcpu-%d: fail to passthrough LBR.\n",
- vcpu->vcpu_id);
+ pr_warn_ratelimited("vcpu-%d: fail to passthrough LBR.\n", vcpu->vcpu_id);
}
static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
@@ -810,4 +814,6 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = {
.reset = intel_pmu_reset,
.deliver_pmi = intel_pmu_deliver_pmi,
.cleanup = intel_pmu_cleanup,
+ .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT,
+ .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC,
};
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 1b56c5e5c9fb..94c38bea60e7 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kvm_host.h>
#include <asm/irq_remapping.h>
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index b12da2a6dec9..aa53c98034bf 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2021 Intel Corporation. */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/sgx.h>
@@ -164,7 +165,7 @@ static int __handle_encls_ecreate(struct kvm_vcpu *vcpu,
if (!vcpu->kvm->arch.sgx_provisioning_allowed &&
(attributes & SGX_ATTR_PROVISIONKEY)) {
if (sgx_12_1->eax & SGX_ATTR_PROVISIONKEY)
- pr_warn_once("KVM: SGX PROVISIONKEY advertised but not allowed\n");
+ pr_warn_once("SGX PROVISIONKEY advertised but not allowed\n");
kvm_inject_gp(vcpu, 0);
return 1;
}
@@ -381,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu)
return handle_encls_ecreate(vcpu);
if (leaf == EINIT)
return handle_encls_einit(vcpu);
- WARN(1, "KVM: unexpected exit on ENCLS[%u]", leaf);
+ WARN_ONCE(1, "unexpected exit on ENCLS[%u]", leaf);
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_ENCLS;
return 0;
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index ac290a44a693..7c1996b433e2 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -75,7 +75,7 @@ struct loaded_vmcs {
struct vmcs_controls_shadow controls_shadow;
};
-static inline bool is_intr_type(u32 intr_info, u32 type)
+static __always_inline bool is_intr_type(u32 intr_info, u32 type)
{
const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
@@ -146,7 +146,7 @@ static inline bool is_icebp(u32 intr_info)
return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}
-static inline bool is_nmi(u32 intr_info)
+static __always_inline bool is_nmi(u32 intr_info)
{
return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 2251b60920f8..106a72c923ca 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "vmcs12.h"
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 766c6b3ef5ed..f550540ed54e 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -31,6 +31,39 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
+.macro VMX_DO_EVENT_IRQOFF call_insn call_target
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ \call_insn \call_target
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ RET
+.endm
+
.section .noinstr.text, "ax"
/**
@@ -69,8 +102,8 @@ SYM_FUNC_START(__vmx_vcpu_run)
*/
push %_ASM_ARG2
- /* Copy @flags to BL, _ASM_ARG3 is volatile. */
- mov %_ASM_ARG3B, %bl
+ /* Copy @flags to EBX, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3L, %ebx
lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
@@ -106,7 +139,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- testb $VMX_RUN_VMRESUME, %bl
+ test $VMX_RUN_VMRESUME, %ebx
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -128,7 +161,7 @@ SYM_FUNC_START(__vmx_vcpu_run)
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Check EFLAGS.ZF from 'testb' above */
+ /* Check EFLAGS.ZF from 'test VMX_RUN_VMRESUME' above */
jz .Lvmlaunch
/*
@@ -266,6 +299,10 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
SYM_FUNC_END(__vmx_vcpu_run)
+SYM_FUNC_START(vmx_do_nmi_irqoff)
+ VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
+SYM_FUNC_END(vmx_do_nmi_irqoff)
+
.section .text, "ax"
@@ -320,35 +357,6 @@ SYM_FUNC_START(vmread_error_trampoline)
SYM_FUNC_END(vmread_error_trampoline)
#endif
-SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
- /*
- * Unconditionally create a stack frame, getting the correct RSP on the
- * stack (for x86-64) would take two instructions anyways, and RBP can
- * be used to restore RSP to make objtool happy (see below).
- */
- push %_ASM_BP
- mov %_ASM_SP, %_ASM_BP
-
-#ifdef CONFIG_X86_64
- /*
- * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
- * creating the synthetic interrupt stack frame for the IRQ/NMI.
- */
- and $-16, %rsp
- push $__KERNEL_DS
- push %rbp
-#endif
- pushf
- push $__KERNEL_CS
- CALL_NOSPEC _ASM_ARG1
-
- /*
- * "Restore" RSP from RBP, even though IRET has already unwound RSP to
- * the correct value. objtool doesn't know the callee will IRET and,
- * without the explicit restore, thinks the stack is getting walloped.
- * Using an unwind hint is problematic due to x86-64's dynamic alignment.
- */
- mov %_ASM_BP, %_ASM_SP
- pop %_ASM_BP
- RET
-SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
+SYM_FUNC_START(vmx_do_interrupt_irqoff)
+ VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
+SYM_FUNC_END(vmx_do_interrupt_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 7eec0226d56a..bcac3efcde41 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -12,6 +12,7 @@
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/highmem.h>
#include <linux/hrtimer.h>
@@ -444,36 +445,36 @@ void vmread_error(unsigned long field, bool fault)
if (fault)
kvm_spurious_fault();
else
- vmx_insn_failed("kvm: vmread failed: field=%lx\n", field);
+ vmx_insn_failed("vmread failed: field=%lx\n", field);
}
noinline void vmwrite_error(unsigned long field, unsigned long value)
{
- vmx_insn_failed("kvm: vmwrite failed: field=%lx val=%lx err=%u\n",
+ vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmclear failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
{
- vmx_insn_failed("kvm: vmptrld failed: %p/%llx err=%u\n",
+ vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
}
noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
{
- vmx_insn_failed("kvm: invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
+ vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
ext, vpid, gva);
}
noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
{
- vmx_insn_failed("kvm: invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
+ vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
ext, eptp, gpa);
}
@@ -488,8 +489,8 @@ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
-struct vmcs_config vmcs_config;
-struct vmx_capability vmx_capability;
+struct vmcs_config vmcs_config __ro_after_init;
+struct vmx_capability vmx_capability __ro_after_init;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
@@ -523,6 +524,8 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
static unsigned long host_idt_base;
#if IS_ENABLED(CONFIG_HYPERV)
+static struct kvm_x86_ops vmx_x86_ops __initdata;
+
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
@@ -551,6 +554,71 @@ static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
return 0;
}
+static __init void hv_init_evmcs(void)
+{
+ int cpu;
+
+ if (!enlightened_vmcs)
+ return;
+
+ /*
+ * Enlightened VMCS usage should be recommended and the host needs
+ * to support eVMCS v1 or above.
+ */
+ if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
+ (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
+ KVM_EVMCS_VERSION) {
+
+ /* Check that we have assist pages on all online CPUs */
+ for_each_online_cpu(cpu) {
+ if (!hv_get_vp_assist_page(cpu)) {
+ enlightened_vmcs = false;
+ break;
+ }
+ }
+
+ if (enlightened_vmcs) {
+ pr_info("Using Hyper-V Enlightened VMCS\n");
+ static_branch_enable(&enable_evmcs);
+ }
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+ vmx_x86_ops.enable_l2_tlb_flush
+ = hv_enable_l2_tlb_flush;
+
+ } else {
+ enlightened_vmcs = false;
+ }
+}
+
+static void hv_reset_evmcs(void)
+{
+ struct hv_vp_assist_page *vp_ap;
+
+ if (!static_branch_unlikely(&enable_evmcs))
+ return;
+
+ /*
+ * KVM should enable eVMCS if and only if all CPUs have a VP assist
+ * page, and should reject CPU onlining if eVMCS is enabled the CPU
+ * doesn't have a VP assist page allocated.
+ */
+ vp_ap = hv_get_vp_assist_page(smp_processor_id());
+ if (WARN_ON_ONCE(!vp_ap))
+ return;
+
+ /*
+ * Reset everything to support using non-enlightened VMCS access later
+ * (e.g. when we reload the module with enlightened_vmcs=0)
+ */
+ vp_ap->nested_control.features.directhypercall = 0;
+ vp_ap->current_nested_vmcs = 0;
+ vp_ap->enlighten_vmentry = 0;
+}
+
+#else /* IS_ENABLED(CONFIG_HYPERV) */
+static void hv_init_evmcs(void) {}
+static void hv_reset_evmcs(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
@@ -1613,8 +1681,8 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
if (!instr_len)
goto rip_updated;
- WARN(exit_reason.enclave_mode,
- "KVM: skipping instruction after SGX enclave VM-Exit");
+ WARN_ONCE(exit_reason.enclave_mode,
+ "skipping instruction after SGX enclave VM-Exit");
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + instr_len;
@@ -2138,9 +2206,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
invalid = data & ~vmx_get_supported_debugctl(vcpu, msr_info->host_initiated);
if (invalid & (DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR)) {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "%s: BTF|LBR in IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr_index, data);
data &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
invalid &= ~(DEBUGCTLMSR_BTF|DEBUGCTLMSR_LBR);
}
@@ -2448,88 +2514,6 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
}
}
-static __init int cpu_has_kvm_support(void)
-{
- return cpu_has_vmx();
-}
-
-static __init int vmx_disabled_by_bios(void)
-{
- return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !boot_cpu_has(X86_FEATURE_VMX);
-}
-
-static int kvm_cpu_vmxon(u64 vmxon_pointer)
-{
- u64 msr;
-
- cr4_set_bits(X86_CR4_VMXE);
-
- asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
- _ASM_EXTABLE(1b, %l[fault])
- : : [vmxon_pointer] "m"(vmxon_pointer)
- : : fault);
- return 0;
-
-fault:
- WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
- rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
- cr4_clear_bits(X86_CR4_VMXE);
-
- return -EFAULT;
-}
-
-static int vmx_hardware_enable(void)
-{
- int cpu = raw_smp_processor_id();
- u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- int r;
-
- if (cr4_read_shadow() & X86_CR4_VMXE)
- return -EBUSY;
-
- /*
- * This can happen if we hot-added a CPU but failed to allocate
- * VP assist page for it.
- */
- if (static_branch_unlikely(&enable_evmcs) &&
- !hv_get_vp_assist_page(cpu))
- return -EFAULT;
-
- intel_pt_handle_vmx(1);
-
- r = kvm_cpu_vmxon(phys_addr);
- if (r) {
- intel_pt_handle_vmx(0);
- return r;
- }
-
- if (enable_ept)
- ept_sync_global();
-
- return 0;
-}
-
-static void vmclear_local_loaded_vmcss(void)
-{
- int cpu = raw_smp_processor_id();
- struct loaded_vmcs *v, *n;
-
- list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
- loaded_vmcss_on_cpu_link)
- __loaded_vmcs_clear(v);
-}
-
-static void vmx_hardware_disable(void)
-{
- vmclear_local_loaded_vmcss();
-
- if (cpu_vmxoff())
- kvm_spurious_fault();
-
- intel_pt_handle_vmx(0);
-}
-
/*
* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
* directly instead of going through cpu_has(), to ensure KVM is trapping
@@ -2565,8 +2549,7 @@ static bool cpu_has_perf_global_ctrl_bug(void)
return false;
}
-static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
- u32 msr, u32 *result)
+static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
@@ -2584,7 +2567,7 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
-static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
+static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
{
u64 allowed;
@@ -2593,8 +2576,8 @@ static __init u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
return ctl_opt & allowed;
}
-static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
- struct vmx_capability *vmx_cap)
+static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
+ struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
@@ -2752,9 +2735,127 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->vmentry_ctrl = _vmentry_control;
vmcs_conf->misc = misc_msr;
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
+ evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
+
+ return 0;
+}
+
+static bool kvm_is_vmx_supported(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!cpu_has_vmx()) {
+ pr_err("VMX not supported by CPU %d\n", cpu);
+ return false;
+ }
+
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
+ return false;
+ }
+
+ return true;
+}
+
+static int vmx_check_processor_compat(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct vmcs_config vmcs_conf;
+ struct vmx_capability vmx_cap;
+
+ if (!kvm_is_vmx_supported())
+ return -EIO;
+
+ if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0) {
+ pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ if (nested)
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
+ if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
+ pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
+ return -EIO;
+ }
+ return 0;
+}
+
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
+{
+ u64 msr;
+
+ cr4_set_bits(X86_CR4_VMXE);
+
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
+}
+
+static int vmx_hardware_enable(void)
+{
+ int cpu = raw_smp_processor_id();
+ u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ int r;
+
+ if (cr4_read_shadow() & X86_CR4_VMXE)
+ return -EBUSY;
+
+ /*
+ * This can happen if we hot-added a CPU but failed to allocate
+ * VP assist page for it.
+ */
+ if (static_branch_unlikely(&enable_evmcs) &&
+ !hv_get_vp_assist_page(cpu))
+ return -EFAULT;
+
+ intel_pt_handle_vmx(1);
+
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r) {
+ intel_pt_handle_vmx(0);
+ return r;
+ }
+
+ if (enable_ept)
+ ept_sync_global();
+
return 0;
}
+static void vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v, *n;
+
+ list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ __loaded_vmcs_clear(v);
+}
+
+static void vmx_hardware_disable(void)
+{
+ vmclear_local_loaded_vmcss();
+
+ if (cpu_vmxoff())
+ kvm_spurious_fault();
+
+ hv_reset_evmcs();
+
+ intel_pt_handle_vmx(0);
+}
+
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
@@ -2950,9 +3051,8 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
- printk_once(KERN_WARNING "kvm: segment base is not "
- "paragraph aligned when entering "
- "protected mode (seg=%d)", seg);
+ pr_warn_once("segment base is not paragraph aligned "
+ "when entering protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
@@ -2982,8 +3082,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
- printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
- "called before entering vcpu\n");
+ pr_warn_once("KVM_SET_TSS_ADDR needs to be called before running vCPU\n");
vmx_segment_cache_clear(vmx);
@@ -3800,39 +3899,6 @@ static void seg_setup(int seg)
vmcs_write32(sf->ar_bytes, ar);
}
-static int alloc_apic_access_page(struct kvm *kvm)
-{
- struct page *page;
- void __user *hva;
- int ret = 0;
-
- mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_memslot_enabled)
- goto out;
- hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
- APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
- if (IS_ERR(hva)) {
- ret = PTR_ERR(hva);
- goto out;
- }
-
- page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page)) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * Do not pin the page in memory, so that memory hot-unplug
- * is able to migrate it.
- */
- put_page(page);
- kvm->arch.apic_access_memslot_enabled = true;
-out:
- mutex_unlock(&kvm->slots_lock);
- return ret;
-}
-
int allocate_vpid(void)
{
int vpid;
@@ -3865,8 +3931,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
@@ -3947,29 +4018,20 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type)
vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
-{
- unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
- unsigned long read_intercept;
- int msr;
-
- read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
-
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned int read_idx = msr / BITS_PER_LONG;
- unsigned int write_idx = read_idx + (0x800 / sizeof(long));
-
- msr_bitmap[read_idx] = read_intercept;
- msr_bitmap[write_idx] = ~0ul;
- }
-}
-
static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
{
+ /*
+ * x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
+ * of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
+ * i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
+ */
+ const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
+ const int write_idx = read_idx + (0x800 / sizeof(u64));
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 *msr_bitmap = (u64 *)vmx->vmcs01.msr_bitmap;
u8 mode;
- if (!cpu_has_vmx_msr_bitmap())
+ if (!cpu_has_vmx_msr_bitmap() || WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
return;
if (cpu_has_secondary_exec_ctrls() &&
@@ -3987,7 +4049,18 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
vmx->x2apic_msr_bitmap_mode = mode;
- vmx_reset_x2apic_msrs(vcpu, mode);
+ /*
+ * Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
+ * registers (0x840 and above) intercepted, KVM doesn't support them.
+ * Intercept all writes by default and poke holes as needed. Pass
+ * through reads for all valid registers by default in x2APIC+APICv
+ * mode, only the current timer count needs on-demand emulation by KVM.
+ */
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
+ msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(vcpu->arch.apic);
+ else
+ msr_bitmap[read_idx] = ~0ull;
+ msr_bitmap[write_idx] = ~0ull;
/*
* TPR reads and writes can be virtualized even if virtual interrupt
@@ -4519,6 +4592,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+ /*
+ * KVM doesn't support VMFUNC for L1, but the control is set in KVM's
+ * base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
+ */
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
+
/* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,
* in vmx_set_cr4. */
exec_control &= ~SECONDARY_EXEC_DESC;
@@ -4535,7 +4614,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
* it needs to be set here when dirty logging is already active, e.g.
* if this vCPU was created after dirty logging was enabled.
*/
- if (!vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (!enable_pml || !atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
if (cpu_has_vmx_xsaves()) {
@@ -5099,8 +5178,13 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
vect_info = vmx->idt_vectoring_info;
intr_info = vmx_get_intr_info(vcpu);
+ /*
+ * Machine checks are handled by handle_exception_irqoff(), or by
+ * vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
+ * vmx_vcpu_enter_exit().
+ */
if (is_machine_check(intr_info) || is_nmi(intr_info))
- return 1; /* handled by handle_exception_nmi_irqoff() */
+ return 1;
/*
* Queue the exception here instead of in handle_nm_fault_irqoff().
@@ -6790,17 +6874,8 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
-void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
-
-static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
- unsigned long entry)
-{
- bool is_nmi = entry == (unsigned long)asm_exc_nmi_noist;
-
- kvm_before_interrupt(vcpu, is_nmi ? KVM_HANDLING_NMI : KVM_HANDLING_IRQ);
- vmx_do_interrupt_nmi_irqoff(entry);
- kvm_after_interrupt(vcpu);
-}
+void vmx_do_interrupt_irqoff(unsigned long entry);
+void vmx_do_nmi_irqoff(void);
static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
{
@@ -6822,9 +6897,8 @@ static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
}
-static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
+static void handle_exception_irqoff(struct vcpu_vmx *vmx)
{
- const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
@@ -6836,9 +6910,6 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
/* Handle machine checks before interrupts are enabled */
else if (is_machine_check(intr_info))
kvm_machine_check();
- /* We need to handle NMIs before interrupts are enabled */
- else if (is_nmi(intr_info))
- handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
@@ -6848,10 +6919,13 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
gate_desc *desc = (gate_desc *)host_idt_base + vector;
if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
- "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
+ "unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+ kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
+ vmx_do_interrupt_irqoff(gate_offset(desc));
+ kvm_after_interrupt(vcpu);
+
vcpu->arch.at_instruction_boundary = true;
}
@@ -6865,7 +6939,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
- handle_exception_nmi_irqoff(vmx);
+ handle_exception_irqoff(vmx);
}
/*
@@ -7100,9 +7174,10 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
}
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_vmx *vmx,
- unsigned long flags)
+ unsigned int flags)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
guest_state_enter_irqoff();
/* L1D Flush includes CPU buffer clear to mitigate MDS */
@@ -7126,6 +7201,18 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
vmx_enable_fb_clear(vmx);
+ if (unlikely(vmx->fail))
+ vmx->exit_reason.full = 0xdead;
+ else
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+
+ if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
+ is_nmi(vmx_get_intr_info(vcpu))) {
+ kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
+ vmx_do_nmi_irqoff();
+ kvm_after_interrupt(vcpu);
+ }
+
guest_state_exit_irqoff();
}
@@ -7220,7 +7307,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
kvm_wait_lapic_expire(vcpu);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
+ vmx_vcpu_enter_exit(vcpu, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs)) {
@@ -7267,12 +7354,9 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->idt_vectoring_info = 0;
- if (unlikely(vmx->fail)) {
- vmx->exit_reason.full = 0xdead;
+ if (unlikely(vmx->fail))
return EXIT_FASTPATH_NONE;
- }
- vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
@@ -7386,7 +7470,7 @@ static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs = &vmx->vmcs01;
if (cpu_need_virtualize_apic_accesses(vcpu)) {
- err = alloc_apic_access_page(vcpu->kvm);
+ err = kvm_alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
@@ -7446,29 +7530,6 @@ static int vmx_vm_init(struct kvm *kvm)
return 0;
}
-static int __init vmx_check_processor_compat(void)
-{
- struct vmcs_config vmcs_conf;
- struct vmx_capability vmx_cap;
-
- if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
- !this_cpu_has(X86_FEATURE_VMX)) {
- pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
- return -EIO;
- }
-
- if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
- return -EIO;
- if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
- if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
- printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
- smp_processor_id());
- return -EIO;
- }
- return 0;
-}
-
static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 cache;
@@ -7940,17 +8001,20 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (WARN_ON_ONCE(!enable_pml))
+ return;
+
if (is_guest_mode(vcpu)) {
vmx->nested.update_vmcs01_cpu_dirty_logging = true;
return;
}
/*
- * Note, cpu_dirty_logging_count can be changed concurrent with this
+ * Note, nr_memslots_dirty_logging can be changed concurrent with this
* code, but in that case another update request will be made and so
* the guest will never run with a stale PML value.
*/
- if (vcpu->kvm->arch.cpu_dirty_logging_count)
+ if (atomic_read(&vcpu->kvm->nr_memslots_dirty_logging))
secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
else
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
@@ -8048,17 +8112,16 @@ static void vmx_hardware_unsetup(void)
free_kvm_area();
}
-static bool vmx_check_apicv_inhibit_reasons(enum kvm_apicv_inhibit reason)
-{
- ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
- BIT(APICV_INHIBIT_REASON_ABSENT) |
- BIT(APICV_INHIBIT_REASON_HYPERV) |
- BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |
- BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |
- BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED);
-
- return supported & BIT(reason);
-}
+#define VMX_REQUIRED_APICV_INHIBITS \
+( \
+ BIT(APICV_INHIBIT_REASON_DISABLE)| \
+ BIT(APICV_INHIBIT_REASON_ABSENT) | \
+ BIT(APICV_INHIBIT_REASON_HYPERV) | \
+ BIT(APICV_INHIBIT_REASON_BLOCKIRQ) | \
+ BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) | \
+ BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED) \
+)
static void vmx_vm_destroy(struct kvm *kvm)
{
@@ -8068,7 +8131,9 @@ static void vmx_vm_destroy(struct kvm *kvm)
}
static struct kvm_x86_ops vmx_x86_ops __initdata = {
- .name = "kvm_intel",
+ .name = KBUILD_MODNAME,
+
+ .check_processor_compatibility = vmx_check_processor_compat,
.hardware_unsetup = vmx_hardware_unsetup,
@@ -8142,7 +8207,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
- .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@ -8288,7 +8353,7 @@ static __init int hardware_setup(void)
return -EIO;
if (cpu_has_perf_global_ctrl_bug())
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
if (boot_cpu_has(X86_FEATURE_NX))
@@ -8296,7 +8361,7 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_MPX)) {
rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
- WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+ WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
}
if (!cpu_has_vmx_mpx())
@@ -8315,7 +8380,7 @@ static __init int hardware_setup(void)
/* NX support is required for shadow paging. */
if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("kvm: NX (Execute Disable) not supported\n");
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
return -EOPNOTSUPP;
}
@@ -8467,9 +8532,6 @@ static __init int hardware_setup(void)
}
static struct kvm_x86_init_ops vmx_init_ops __initdata = {
- .cpu_has_kvm_support = cpu_has_kvm_support,
- .disabled_by_bios = vmx_disabled_by_bios,
- .check_processor_compatibility = vmx_check_processor_compat,
.hardware_setup = hardware_setup,
.handle_intel_pt_intr = NULL,
@@ -8487,41 +8549,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
+ kvm_x86_vendor_exit();
-#if IS_ENABLED(CONFIG_HYPERV)
- if (static_branch_unlikely(&enable_evmcs)) {
- int cpu;
- struct hv_vp_assist_page *vp_ap;
- /*
- * Reset everything to support using non-enlightened VMCS
- * access later (e.g. when we reload the module with
- * enlightened_vmcs=0)
- */
- for_each_online_cpu(cpu) {
- vp_ap = hv_get_vp_assist_page(cpu);
-
- if (!vp_ap)
- continue;
-
- vp_ap->nested_control.features.directhypercall = 0;
- vp_ap->current_nested_vmcs = 0;
- vp_ap->enlighten_vmentry = 0;
- }
-
- static_branch_disable(&enable_evmcs);
- }
-#endif
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8529,56 +8573,29 @@ static int __init vmx_init(void)
{
int r, cpu;
-#if IS_ENABLED(CONFIG_HYPERV)
+ if (!kvm_is_vmx_supported())
+ return -EOPNOTSUPP;
+
/*
- * Enlightened VMCS usage should be recommended and the host needs
- * to support eVMCS v1 or above. We can also disable eVMCS support
- * with module parameter.
+ * Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
+ * to unwind if a later step fails.
*/
- if (enlightened_vmcs &&
- ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
- (ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
- KVM_EVMCS_VERSION) {
-
- /* Check that we have assist pages on all online CPUs */
- for_each_online_cpu(cpu) {
- if (!hv_get_vp_assist_page(cpu)) {
- enlightened_vmcs = false;
- break;
- }
- }
+ hv_init_evmcs();
- if (enlightened_vmcs) {
- pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
- static_branch_enable(&enable_evmcs);
- }
-
- if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
- vmx_x86_ops.enable_l2_tlb_flush
- = hv_enable_l2_tlb_flush;
-
- } else {
- enlightened_vmcs = false;
- }
-#endif
-
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ r = kvm_x86_vendor_init(&vmx_init_ops);
if (r)
return r;
/*
- * Must be called after kvm_init() so enable_ept is properly set
+ * Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
* the pre module init parser. If no parameter was given, it will
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ if (r)
+ goto err_l1d_flush;
vmx_setup_fb_clear_ctrl();
@@ -8602,6 +8619,21 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
+ THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
+
+err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
+ kvm_x86_vendor_exit();
+ return r;
}
module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a3da84f4ea45..2acdc54bc34b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -640,12 +640,12 @@ BUILD_CONTROLS_SHADOW(tertiary_exec, TERTIARY_VM_EXEC_CONTROL, 64)
(1 << VCPU_EXREG_EXIT_INFO_1) | \
(1 << VCPU_EXREG_EXIT_INFO_2))
-static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
+static __always_inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm)
{
return container_of(kvm, struct kvm_vmx, kvm);
}
-static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
+static __always_inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
@@ -669,25 +669,23 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu);
int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu);
void vmx_passthrough_lbr_msrs(struct kvm_vcpu *vcpu);
-static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
+static __always_inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1))
vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- }
+
return vmx->exit_qualification;
}
-static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+static __always_inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
- kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ if (!kvm_register_test_and_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2))
vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- }
+
return vmx->exit_intr_info;
}
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 842dc898c972..db95bde52998 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -100,8 +100,10 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field)
return value;
do_fail:
- WARN_ONCE(1, "kvm: vmread failed: field=%lx\n", field);
- pr_warn_ratelimited("kvm: vmread failed: field=%lx\n", field);
+ instrumentation_begin();
+ WARN_ONCE(1, KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ pr_warn_ratelimited(KBUILD_MODNAME ": vmread failed: field=%lx\n", field);
+ instrumentation_end();
return 0;
do_exception:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0735fbc9ba8c..7713420abab0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,6 +15,7 @@
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kvm_host.h>
#include "irq.h"
@@ -128,6 +129,7 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
+static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;
#define KVM_X86_OP(func) \
@@ -1421,7 +1423,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
* may depend on host virtualization features rather than host cpu features.
*/
-static const u32 msrs_to_save_all[] = {
+static const u32 msrs_to_save_base[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -1438,6 +1440,10 @@ static const u32 msrs_to_save_all[] = {
MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
MSR_IA32_UMWAIT_CONTROL,
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
@@ -1462,11 +1468,10 @@ static const u32 msrs_to_save_all[] = {
MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;
static const u32 emulated_msrs_all[] = {
@@ -1484,7 +1489,7 @@ static const u32 emulated_msrs_all[] = {
HV_X64_MSR_STIMER0_CONFIG,
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
HV_X64_MSR_SYNDBG_OPTIONS,
HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
@@ -2093,7 +2098,7 @@ static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
!guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
return kvm_handle_invalid_op(vcpu);
- pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
+ pr_warn_once("%s instruction emulated as NOP!\n", insn);
return kvm_emulate_as_nop(vcpu);
}
int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
@@ -2440,7 +2445,8 @@ static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
- pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
+ pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n",
+ user_tsc_khz, thresh_lo, thresh_hi);
use_scaling = 1;
}
return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
@@ -3165,6 +3171,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
vcpu->hw_tsc_khz = tgt_tsc_khz;
+ kvm_xen_update_tsc_info(v);
}
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
@@ -3562,9 +3569,20 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
+static bool kvm_is_msr_to_save(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- bool pr = false;
u32 msr = msr_info->index;
u64 data = msr_info->data;
@@ -3610,15 +3628,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data == BIT_ULL(18)) {
vcpu->arch.msr_hwcr = data;
} else if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
- data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
case MSR_FAM10H_MMIO_CONF_BASE:
if (data != 0) {
- vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
- "0x%llx\n", data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
return 1;
}
break;
@@ -3798,16 +3814,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true;
- fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (pr || data != 0)
- vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
+ if (data)
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_K7_CLK_CTL:
/*
@@ -3828,15 +3841,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_set_msr_common(vcpu, msr, data,
msr_info->host_initiated);
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ kvm_pr_unimpl_wrmsr(vcpu, msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
@@ -3884,20 +3896,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vcpu->arch.guest_fpu.xfd_err = data;
break;
#endif
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
+ default:
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
+
/*
* Userspace is allowed to write '0' to MSRs that KVM reports
* as to-be-saved, even if an MSRs isn't fully supported.
*/
- return !msr_info->host_initiated || data;
- default:
- if (kvm_pmu_is_valid_msr(vcpu, msr))
- return kvm_pmu_set_msr(vcpu, msr_info);
+ if (msr_info->host_initiated && !data &&
+ kvm_is_msr_to_save(msr))
+ break;
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -3987,20 +3997,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
- case MSR_IA32_PEBS_ENABLE:
- case MSR_IA32_DS_AREA:
- case MSR_PEBS_DATA_CFG:
- case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
- if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (!msr_info->host_initiated)
- return 1;
- msr_info->data = 0;
- break;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -4198,6 +4194,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
case HV_X64_MSR_TSC_EMULATION_CONTROL:
case HV_X64_MSR_TSC_EMULATION_STATUS:
+ case HV_X64_MSR_TSC_INVARIANT_CONTROL:
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
@@ -4255,6 +4252,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
+
+ /*
+ * Userspace is allowed to read MSRs that KVM reports as
+ * to-be-saved, even if an MSR isn't fully supported.
+ */
+ if (msr_info->host_initiated &&
+ kvm_is_msr_to_save(msr_info->index)) {
+ msr_info->data = 0;
+ break;
+ }
+
return KVM_MSR_RET_INVALID;
}
return 0;
@@ -4292,8 +4300,8 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
{
struct kvm_msrs msrs;
struct kvm_msr_entry *entries;
- int r, n;
unsigned size;
+ int r;
r = -EFAULT;
if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
@@ -4310,17 +4318,11 @@ static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
goto out;
}
- r = n = __msr_io(vcpu, &msrs, entries, do_msr);
- if (r < 0)
- goto out_free;
+ r = __msr_io(vcpu, &msrs, entries, do_msr);
- r = -EFAULT;
if (writeback && copy_to_user(user_msrs->entries, entries, size))
- goto out_free;
-
- r = n;
+ r = -EFAULT;
-out_free:
kfree(entries);
out:
return r;
@@ -4408,6 +4410,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_PMU_EVENT_FILTER:
+ case KVM_CAP_PMU_EVENT_MASKED_EVENTS:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
@@ -6482,7 +6485,7 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
struct kvm_x86_msr_filter *new_filter, *old_filter;
bool default_allow;
bool empty = true;
- int r = 0;
+ int r;
u32 i;
if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
@@ -6508,17 +6511,14 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
}
mutex_lock(&kvm->lock);
-
- /* The per-VM filter is protected by kvm->lock... */
- old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
-
- rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter,
+ mutex_is_locked(&kvm->lock));
+ mutex_unlock(&kvm->lock);
synchronize_srcu(&kvm->srcu);
kvm_free_msr_filter(old_filter);
kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
- mutex_unlock(&kvm->lock);
return 0;
}
@@ -7018,83 +7018,98 @@ out:
return r;
}
-static void kvm_init_msr_list(void)
+static void kvm_probe_msr_to_save(u32 msr_index)
{
u32 dummy[2];
+
+ if (rdmsr_safe(msr_index, &dummy[0], &dummy[1]))
+ return;
+
+ /*
+ * Even MSRs that are valid in the host may not be exposed to guests in
+ * some cases.
+ */
+ switch (msr_index) {
+ case MSR_IA32_BNDCFGS:
+ if (!kvm_mpx_supported())
+ return;
+ break;
+ case MSR_TSC_AUX:
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
+ !kvm_cpu_cap_has(X86_FEATURE_RDPID))
+ return;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ return;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ return;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ return;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ return;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (msr_index - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
+ return;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ kvm_pmu_cap.num_counters_gp)
+ return;
+ break;
+ case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX:
+ if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >=
+ kvm_pmu_cap.num_counters_fixed)
+ return;
+ break;
+ case MSR_IA32_XFD:
+ case MSR_IA32_XFD_ERR:
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ return;
+ break;
+ default:
+ break;
+ }
+
+ msrs_to_save[num_msrs_to_save++] = msr_index;
+}
+
+static void kvm_init_msr_list(void)
+{
unsigned i;
BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
- "Please update the fixed PMCs in msrs_to_saved_all[]");
+ "Please update the fixed PMCs in msrs_to_save_pmu[]");
num_msrs_to_save = 0;
num_emulated_msrs = 0;
num_msr_based_features = 0;
- for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
- if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
- continue;
-
- /*
- * Even MSRs that are valid in the host may not be exposed
- * to the guests in some cases.
- */
- switch (msrs_to_save_all[i]) {
- case MSR_IA32_BNDCFGS:
- if (!kvm_mpx_supported())
- continue;
- break;
- case MSR_TSC_AUX:
- if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
- !kvm_cpu_cap_has(X86_FEATURE_RDPID))
- continue;
- break;
- case MSR_IA32_UMWAIT_CONTROL:
- if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
- continue;
- break;
- case MSR_IA32_RTIT_CTL:
- case MSR_IA32_RTIT_STATUS:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
- continue;
- break;
- case MSR_IA32_RTIT_CR3_MATCH:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
- continue;
- break;
- case MSR_IA32_RTIT_OUTPUT_BASE:
- case MSR_IA32_RTIT_OUTPUT_MASK:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
- !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
- continue;
- break;
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
- if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
- msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
- intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
- continue;
- break;
- case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
- if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
- min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
- continue;
- break;
- case MSR_IA32_XFD:
- case MSR_IA32_XFD_ERR:
- if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
- continue;
- break;
- default:
- break;
- }
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++)
+ kvm_probe_msr_to_save(msrs_to_save_base[i]);
- msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
+ if (enable_pmu) {
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++)
+ kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
@@ -7721,7 +7736,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
emul_write:
- printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
+ pr_warn_once("emulating exchange as write\n");
return emulator_write_emulated(ctxt, addr, new, bytes, exception);
}
@@ -8167,9 +8182,14 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
}
-static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
+static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt)
+{
+ return is_smm(emul_to_vcpu(ctxt));
+}
+
+static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt)
{
- return emul_to_vcpu(ctxt)->arch.hflags;
+ return is_guest_mode(emul_to_vcpu(ctxt));
}
#ifndef CONFIG_KVM_SMM
@@ -8238,7 +8258,8 @@ static const struct x86_emulate_ops emulate_ops = {
.guest_has_fxsr = emulator_guest_has_fxsr,
.guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
- .get_hflags = emulator_get_hflags,
+ .is_smm = emulator_is_smm,
+ .is_guest_mode = emulator_is_guest_mode,
.leave_smm = emulator_leave_smm,
.triple_fault = emulator_triple_fault,
.set_xcr = emulator_set_xcr,
@@ -8282,7 +8303,7 @@ static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
if (!ctxt) {
- pr_err("kvm: failed to allocate vcpu's emulator\n");
+ pr_err("failed to allocate vcpu's emulator\n");
return NULL;
}
@@ -8310,8 +8331,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
-
ctxt->interruptibility = 0;
ctxt->have_exception = false;
ctxt->exception.vector = -1;
@@ -9293,35 +9312,66 @@ static struct notifier_block pvclock_gtod_notifier = {
};
#endif
-int kvm_arch_init(void *opaque)
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
+static int kvm_x86_check_processor_compatibility(void)
+{
+ int cpu = smp_processor_id();
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ /*
+ * Compatibility checks are done when loading KVM and when enabling
+ * hardware, e.g. during CPU hotplug, to ensure all online CPUs are
+ * compatible, i.e. KVM should never perform a compatibility check on
+ * an offline CPU.
+ */
+ WARN_ON(!cpu_online(cpu));
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return static_call(kvm_x86_check_processor_compatibility)();
+}
+
+static void kvm_x86_check_cpu_compat(void *ret)
+{
+ *(int *)ret = kvm_x86_check_processor_compatibility();
+}
+
+static int __kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
{
- struct kvm_x86_init_ops *ops = opaque;
u64 host_pat;
- int r;
+ int r, cpu;
if (kvm_x86_ops.hardware_enable) {
- pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
+ pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
- if (!ops->cpu_has_kvm_support()) {
- pr_err_ratelimited("kvm: no hardware support for '%s'\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
- if (ops->disabled_by_bios()) {
- pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
- ops->runtime_ops->name);
- return -EOPNOTSUPP;
- }
-
/*
* KVM explicitly assumes that the guest has an FPU and
* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
* vCPU's FPU state as a fxregs_state struct.
*/
if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
- printk(KERN_ERR "kvm: inadequate fpu\n");
+ pr_err("inadequate fpu\n");
return -EOPNOTSUPP;
}
@@ -9339,19 +9389,19 @@ int kvm_arch_init(void *opaque)
*/
if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
(host_pat & GENMASK(2, 0)) != 6) {
- pr_err("kvm: host PAT[0] is not WB\n");
+ pr_err("host PAT[0] is not WB\n");
return -EIO;
}
x86_emulator_cache = kvm_alloc_emulator_cache();
if (!x86_emulator_cache) {
- pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ pr_err("failed to allocate cache for x86 emulator\n");
return -ENOMEM;
}
user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
if (!user_return_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ pr_err("failed to allocate percpu kvm_user_return_msrs\n");
r = -ENOMEM;
goto out_free_x86_emulator_cache;
}
@@ -9361,13 +9411,37 @@ int kvm_arch_init(void *opaque)
if (r)
goto out_free_percpu;
- kvm_timer_init();
-
if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
}
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ kvm_init_pmu_capability(ops->pmu_ops);
+
+ r = ops->hardware_setup();
+ if (r != 0)
+ goto out_mmu_exit;
+
+ kvm_ops_update(ops);
+
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
+ if (r < 0)
+ goto out_unwind_ops;
+ }
+
+ /*
+ * Point of no return! DO NOT add error paths below this point unless
+ * absolutely necessary, as most operations from this point forward
+ * require unwinding.
+ */
+ kvm_timer_init();
+
if (pi_inject_timer == -1)
pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
#ifdef CONFIG_X86_64
@@ -9377,8 +9451,35 @@ int kvm_arch_init(void *opaque)
set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
#endif
+ kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ kvm_caps.supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
+
+ if (kvm_caps.has_tsc_control) {
+ /*
+ * Make sure the user can only configure tsc_khz values that
+ * fit into a signed integer.
+ * A min value is not calculated because it will always
+ * be 1 on all machines.
+ */
+ u64 max = min(0x7fffffffULL,
+ __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
+ kvm_caps.max_guest_tsc_khz = max;
+ }
+ kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
+ kvm_init_msr_list();
return 0;
+out_unwind_ops:
+ kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
+out_mmu_exit:
+ kvm_mmu_vendor_module_exit();
out_free_percpu:
free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
@@ -9386,8 +9487,22 @@ out_free_x86_emulator_cache:
return r;
}
-void kvm_arch_exit(void)
+int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
+{
+ int r;
+
+ mutex_lock(&vendor_module_lock);
+ r = __kvm_x86_vendor_init(ops);
+ mutex_unlock(&vendor_module_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_init);
+
+void kvm_x86_vendor_exit(void)
{
+ kvm_unregister_perf_callbacks();
+
#ifdef CONFIG_X86_64
if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
clear_hv_tscchange_cb();
@@ -9404,7 +9519,7 @@ void kvm_arch_exit(void)
irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops.hardware_enable = NULL;
+ static_call(kvm_x86_hardware_unsetup)();
kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
@@ -9412,7 +9527,11 @@ void kvm_arch_exit(void)
static_key_deferred_flush(&kvm_xen_enabled);
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
+ mutex_lock(&vendor_module_lock);
+ kvm_x86_ops.hardware_enable = NULL;
+ mutex_unlock(&vendor_module_lock);
}
+EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
{
@@ -10065,7 +10184,7 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
-void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool activate;
@@ -10100,7 +10219,30 @@ out:
preempt_enable();
up_read(&vcpu->kvm->arch.apicv_update_lock);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv);
+
+static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ /*
+ * Due to sharing page tables across vCPUs, the xAPIC memslot must be
+ * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
+ * and hardware doesn't support x2APIC virtualization. E.g. some AMD
+ * CPUs support AVIC but not x2APIC. KVM still allows enabling AVIC in
+ * this case so that KVM can the AVIC doorbell to inject interrupts to
+ * running vCPUs, but KVM must not create SPTEs for the APIC base as
+ * the vCPU would incorrectly be able to access the vAPIC page via MMIO
+ * despite being in x2APIC mode. For simplicity, inhibiting the APIC
+ * access page is sticky.
+ */
+ if (apic_x2apic_mode(vcpu->arch.apic) &&
+ kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
+ kvm_inhibit_apic_access_page(vcpu);
+
+ __kvm_vcpu_update_apicv(vcpu);
+}
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set)
@@ -10109,7 +10251,7 @@ void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
- if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
+ if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason)))
return;
old = new = kvm->arch.apicv_inhibit_reasons;
@@ -11553,7 +11695,7 @@ static int sync_regs(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
if (kvm_check_tsc_unstable() && kvm->created_vcpus)
- pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ pr_warn_once("SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
if (!kvm->arch.max_vcpu_ids)
@@ -11630,7 +11772,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
goto free_wbinvd_dirty_mask;
if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
- pr_err("kvm: failed to allocate vcpu's fpu\n");
+ pr_err("failed to allocate vcpu's fpu\n");
goto free_emulate_ctxt;
}
@@ -11904,6 +12046,11 @@ int kvm_arch_hardware_enable(void)
bool stable, backwards_tsc = false;
kvm_user_return_msr_cpu_online();
+
+ ret = kvm_x86_check_processor_compatibility();
+ if (ret)
+ return ret;
+
ret = static_call(kvm_x86_hardware_enable)();
if (ret != 0)
return ret;
@@ -11990,88 +12137,6 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
-static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
-{
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
-
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-
- kvm_pmu_ops_update(ops->pmu_ops);
-}
-
-int kvm_arch_hardware_setup(void *opaque)
-{
- struct kvm_x86_init_ops *ops = opaque;
- int r;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
-
- kvm_init_pmu_capability();
-
- r = ops->hardware_setup();
- if (r != 0)
- return r;
-
- kvm_ops_update(ops);
-
- kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
-
- if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
- kvm_caps.supported_xss = 0;
-
-#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
- cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
-#undef __kvm_cpu_cap_has
-
- if (kvm_caps.has_tsc_control) {
- /*
- * Make sure the user can only configure tsc_khz values that
- * fit into a signed integer.
- * A min value is not calculated because it will always
- * be 1 on all machines.
- */
- u64 max = min(0x7fffffffULL,
- __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
- kvm_caps.max_guest_tsc_khz = max;
- }
- kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
- kvm_init_msr_list();
- return 0;
-}
-
-void kvm_arch_hardware_unsetup(void)
-{
- kvm_unregister_perf_callbacks();
-
- static_call(kvm_x86_hardware_unsetup)();
-}
-
-int kvm_arch_check_processor_compat(void *opaque)
-{
- struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
- struct kvm_x86_init_ops *ops = opaque;
-
- WARN_ON(!irqs_disabled());
-
- if (__cr4_reserved_bits(cpu_has, c) !=
- __cr4_reserved_bits(cpu_has, &boot_cpu_data))
- return -EIO;
-
- return ops->check_processor_compatibility();
-}
-
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
@@ -12244,7 +12309,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
*/
hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, 0);
- if (IS_ERR((void *)hva))
+ if (IS_ERR_VALUE(hva))
return (void __user *)hva;
} else {
if (!slot || !slot->npages)
@@ -12459,16 +12524,14 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
- struct kvm_arch *ka = &kvm->arch;
+ int nr_slots;
if (!kvm_x86_ops.cpu_dirty_log_size)
return;
- if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
- (!enable && --ka->cpu_dirty_logging_count == 0))
+ nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
+ if ((enable && nr_slots == 1) || !nr_slots)
kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
-
- WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 9de72586f406..a8167b47b8c8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -331,6 +331,18 @@ extern bool report_ignored_msrs;
extern bool eager_page_split;
+static inline void kvm_pr_unimpl_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled WRMSR(0x%x) = 0x%llx\n", msr, data);
+}
+
+static inline void kvm_pr_unimpl_rdmsr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "Unhandled RDMSR(0x%x)\n", msr);
+}
+
static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
{
return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
@@ -382,13 +394,13 @@ enum kvm_intr_type {
KVM_HANDLING_NMI,
};
-static inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
- enum kvm_intr_type intr)
+static __always_inline void kvm_before_interrupt(struct kvm_vcpu *vcpu,
+ enum kvm_intr_type intr)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, (u8)intr);
}
-static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline void kvm_after_interrupt(struct kvm_vcpu *vcpu)
{
WRITE_ONCE(vcpu->arch.handling_intr_from_guest, 0);
}
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 8fd41f5deae3..40edf4d1974c 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -5,6 +5,7 @@
*
* KVM Xen emulation
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "x86.h"
#include "xen.h"
@@ -22,6 +23,9 @@
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>
+#include <asm/xen/cpuid.h>
+
+#include "cpuid.h"
#include "trace.h"
static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
@@ -2076,6 +2080,29 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
del_timer_sync(&vcpu->arch.xen.poll_timer);
}
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *entry;
+ u32 function;
+
+ if (!vcpu->arch.xen.cpuid.base)
+ return;
+
+ function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3);
+ if (function > vcpu->arch.xen.cpuid.limit)
+ return;
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
+ if (entry) {
+ entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
+ entry->edx = vcpu->arch.hv_clock.tsc_shift;
+ }
+
+ entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
+ if (entry)
+ entry->eax = vcpu->arch.hw_tsc_khz;
+}
+
void kvm_xen_init_vm(struct kvm *kvm)
{
mutex_init(&kvm->arch.xen.xen_lock);
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index ea33d80a0c51..f8f1fe22d090 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -9,6 +9,8 @@
#ifndef __ARCH_X86_KVM_XEN_H__
#define __ARCH_X86_KVM_XEN_H__
+#include <asm/xen/hypervisor.h>
+
#ifdef CONFIG_KVM_XEN
#include <linux/jump_label_ratelimit.h>
@@ -32,6 +34,7 @@ int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
+void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu);
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
@@ -135,6 +138,10 @@ static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
{
return false;
}
+
+static inline void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
+{
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index bddfc9a46645..90e820ac9771 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
PHONY += posttest
-ifeq ($(KBUILD_VERBOSE),1)
+ifneq ($(findstring 1, $(KBUILD_VERBOSE)),)
posttest_verbose = -v
else
posttest_verbose =
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index 3d5cd2e57820..ee89f6bb9242 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -48,4 +48,4 @@ include/generated/user_constants.h: $(obj)/user-offsets.s FORCE
UNPROFILE_OBJS := stub_segv.o
CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile
index 253bfb8cb702..ae169125d03f 100644
--- a/arch/x86/um/os-Linux/Makefile
+++ b/arch/x86/um/os-Linux/Makefile
@@ -10,4 +10,4 @@ obj-$(CONFIG_64BIT) += prctl.o
USER_OBJS := $(obj-y)
-include arch/um/scripts/Makefile.rules
+include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile
index 6fbe97c52c99..6825e146a62f 100644
--- a/arch/x86/um/vdso/Makefile
+++ b/arch/x86/um/vdso/Makefile
@@ -61,7 +61,7 @@ CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage
#
quiet_cmd_vdso = VDSO $@
cmd_vdso = $(CC) -nostdlib -o $@ \
- $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
+ $(CC_FLAGS_LTO) $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
-Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index 2112b8d14668..ff0f3b4b6c45 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -17,8 +17,10 @@ int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory");
+ asm("syscall"
+ : "=a" (ret)
+ : "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
+ : "rcx", "r11", "memory");
return ret;
}
@@ -29,8 +31,10 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
{
long ret;
- asm("syscall" : "=a" (ret) :
- "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory");
+ asm("syscall"
+ : "=a" (ret)
+ : "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
+ : "rcx", "r11", "memory");
return ret;
}