-rw-r--r--Documentation/admin-guide/kernel-parameters.txt17
-rw-r--r--Documentation/arch/s390/vfio-ap.rst30
-rw-r--r--Documentation/core-api/cleanup.rst8
-rw-r--r--Documentation/core-api/folio_queue.rst212
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml12
-rw-r--r--Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml2
-rw-r--r--Documentation/virt/kvm/api.rst31
-rw-r--r--Documentation/virt/kvm/locking.rst32
-rw-r--r--MAINTAINERS3
-rw-r--r--Makefile4
-rw-r--r--arch/arm64/kvm/arm.c6
-rw-r--r--arch/loongarch/kvm/main.c4
-rw-r--r--arch/mips/include/asm/kvm_host.h4
-rw-r--r--arch/mips/kvm/mips.c8
-rw-r--r--arch/mips/kvm/vz.c8
-rw-r--r--arch/riscv/kvm/main.c4
-rw-r--r--arch/s390/configs/debug_defconfig1
-rw-r--r--arch/s390/kernel/vdso64/vdso_user_wrapper.S14
-rw-r--r--arch/s390/kernel/vdso64/vgetrandom-chacha.S76
-rw-r--r--arch/s390/kvm/kvm-s390.c27
-rw-r--r--arch/x86/coco/tdx/tdx.c6
-rw-r--r--arch/x86/include/asm/atomic64_32.h6
-rw-r--r--arch/x86/include/asm/cpuid.h1
-rw-r--r--arch/x86/include/asm/intel-family.h5
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h32
-rw-r--r--arch/x86/include/asm/msr-index.h34
-rw-r--r--arch/x86/include/asm/reboot.h4
-rw-r--r--arch/x86/include/asm/svm.h20
-rw-r--r--arch/x86/include/asm/vmx.h40
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c6
-rw-r--r--arch/x86/kvm/cpuid.c30
-rw-r--r--arch/x86/kvm/irq.c10
-rw-r--r--arch/x86/kvm/lapic.c84
-rw-r--r--arch/x86/kvm/lapic.h3
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c556
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h5
-rw-r--r--arch/x86/kvm/mmu/mmutrace.h1
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h63
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/reverse_cpuid.h8
-rw-r--r--arch/x86/kvm/smm.c24
-rw-r--r--arch/x86/kvm/svm/nested.c4
-rw-r--r--arch/x86/kvm/svm/svm.c87
-rw-r--r--arch/x86/kvm/svm/svm.h18
-rw-r--r--arch/x86/kvm/svm/vmenter.S8
-rw-r--r--arch/x86/kvm/vmx/capabilities.h10
-rw-r--r--arch/x86/kvm/vmx/main.c10
-rw-r--r--arch/x86/kvm/vmx/nested.c134
-rw-r--r--arch/x86/kvm/vmx/nested.h8
-rw-r--r--arch/x86/kvm/vmx/sgx.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c67
-rw-r--r--arch/x86/kvm/vmx/vmx.h9
-rw-r--r--arch/x86/kvm/vmx/vmx_onhyperv.h8
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h2
-rw-r--r--arch/x86/kvm/vmx/x86_ops.h7
-rw-r--r--arch/x86/kvm/x86.c1002
-rw-r--r--arch/x86/kvm/x86.h31
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S9
-rw-r--r--arch/x86/mm/pat/memtype.c36
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c108
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c64
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c89
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_display.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c13
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_object.c132
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_object.h23
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c31
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c30
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h11
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c23
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c56
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c19
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c32
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v10_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v11_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v6_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/dce_v8_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/imu_v11_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v11_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v12_0.c14
-rw-r--r--drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/psp_v13_0.c17
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/soc24.c23
-rw-r--r--drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c165
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c24
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h3
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c15
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c4
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process.c30
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c86
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h2
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c4
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c1
-rw-r--r--drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c9
-rw-r--r--drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c6
-rw-r--r--drivers/gpu/drm/amd/display/dc/core/dc.c41
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc.h14
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc_dp_types.h12
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc_dsc.h4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dc_spl_translate.c14
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c9
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c6
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c4
-rw-r--r--drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c8
-rw-r--r--drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c15
-rw-r--r--drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c71
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c2
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c14
-rw-r--r--drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c13
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c31
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/link_validation.c7
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c5
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c80
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h16
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c21
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c64
-rw-r--r--drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h19
-rw-r--r--drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c1
-rw-r--r--drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c3
-rw-r--r--drivers/gpu/drm/amd/display/dc/spl/dc_spl.c54
-rw-r--r--drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c85
-rw-r--r--drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h9
-rw-r--r--drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h15
-rw-r--r--drivers/gpu/drm/amd/display/dmub/dmub_srv.h1
-rw-r--r--drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h25
-rw-r--r--drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c1
-rw-r--r--drivers/gpu/drm/amd/display/modules/freesync/freesync.c2
-rw-r--r--drivers/gpu/drm/amd/include/amd_shared.h2
-rw-r--r--drivers/gpu/drm/amd/include/kgd_kfd_interface.h10
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h6
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c6
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c8
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c3
-rw-r--r--drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c6
-rw-r--r--drivers/gpu/drm/i915/display/intel_ddi.c2
-rw-r--r--drivers/gpu/drm/i915/display/intel_dp.c22
-rw-r--r--drivers/gpu/drm/i915/display/intel_psr.c32
-rw-r--r--drivers/gpu/drm/i915/display/intel_psr.h2
-rw-r--r--drivers/gpu/drm/xe/xe_bb.c3
-rw-r--r--drivers/gpu/drm/xe/xe_bo.c14
-rw-r--r--drivers/gpu/drm/xe/xe_bo.h6
-rw-r--r--drivers/gpu/drm/xe/xe_drm_client.c7
-rw-r--r--drivers/gpu/drm/xe/xe_gt_pagefault.c6
-rw-r--r--drivers/gpu/drm/xe/xe_guc.h6
-rw-r--r--drivers/gpu/drm/xe/xe_vram.c1
-rw-r--r--drivers/i2c/busses/Kconfig1
-rw-r--r--drivers/i2c/busses/i2c-designware-common.c14
-rw-r--r--drivers/i2c/busses/i2c-designware-core.h1
-rw-r--r--drivers/i2c/busses/i2c-designware-master.c38
-rw-r--r--drivers/i2c/busses/i2c-synquacer.c5
-rw-r--r--drivers/i2c/busses/i2c-xiic.c2
-rw-r--r--drivers/mailbox/Kconfig3
-rw-r--r--drivers/mailbox/bcm2835-mailbox.c3
-rw-r--r--drivers/mailbox/imx-mailbox.c6
-rw-r--r--drivers/mailbox/mailbox.c22
-rw-r--r--drivers/mailbox/omap-mailbox.c2
-rw-r--r--drivers/mailbox/rockchip-mailbox.c2
-rw-r--r--drivers/mailbox/sprd-mailbox.c25
-rw-r--r--drivers/message/fusion/mptctl.c2
-rw-r--r--drivers/remoteproc/Kconfig6
-rw-r--r--drivers/s390/crypto/vfio_ap_drv.c13
-rw-r--r--drivers/scsi/cxgbi/libcxgbi.h3
-rw-r--r--drivers/scsi/hisi_sas/hisi_sas_v3_hw.c2
-rw-r--r--drivers/scsi/ibmvscsi/ibmvfc.c21
-rw-r--r--drivers/scsi/ibmvscsi/ibmvfc.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_bsg.c3
-rw-r--r--drivers/scsi/lpfc/lpfc_ct.c22
-rw-r--r--drivers/scsi/lpfc/lpfc_disc.h7
-rw-r--r--drivers/scsi/lpfc/lpfc_els.c132
-rw-r--r--drivers/scsi/lpfc/lpfc_hbadisc.c10
-rw-r--r--drivers/scsi/lpfc/lpfc_hw.h21
-rw-r--r--drivers/scsi/lpfc/lpfc_hw4.h3
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c32
-rw-r--r--drivers/scsi/lpfc/lpfc_scsi.c2
-rw-r--r--drivers/scsi/lpfc/lpfc_sli.c52
-rw-r--r--drivers/scsi/lpfc/lpfc_version.h2
-rw-r--r--drivers/scsi/lpfc/lpfc_vport.c43
-rw-r--r--drivers/scsi/megaraid/megaraid_sas_base.c2
-rw-r--r--drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h35
-rw-r--r--drivers/scsi/mpi3mr/mpi/mpi30_image.h13
-rw-r--r--drivers/scsi/mpi3mr/mpi/mpi30_ioc.h8
-rw-r--r--drivers/scsi/mpi3mr/mpi/mpi30_transport.h4
-rw-r--r--drivers/scsi/mpi3mr/mpi3mr.h10
-rw-r--r--drivers/scsi/mpi3mr/mpi3mr_fw.c79
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_base.c5
-rw-r--r--drivers/scsi/pm8001/pm8001_init.c6
-rw-r--r--drivers/scsi/pm8001/pm80xx_hwi.c2
-rw-r--r--drivers/scsi/pmcraid.c2
-rw-r--r--drivers/scsi/qedf/qedf_io.c2
-rw-r--r--drivers/scsi/scsi_debug.c1
-rw-r--r--drivers/scsi/sd.c32
-rw-r--r--drivers/scsi/st.c5
-rw-r--r--drivers/scsi/zalon.c2
-rw-r--r--drivers/ufs/host/ufs-qcom.c2
-rw-r--r--drivers/video/fbdev/core/fbcon.c10
-rw-r--r--drivers/video/fbdev/omap2/omapfb/dss/dss-of.c7
-rw-r--r--drivers/video/fbdev/sis/sis_main.c2
-rw-r--r--fs/afs/afs_vl.h9
-rw-r--r--fs/afs/file.c1
-rw-r--r--fs/afs/fs_operation.c2
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/rotate.c11
-rw-r--r--fs/bcachefs/backpointers.c2
-rw-r--r--fs/bcachefs/bcachefs.h3
-rw-r--r--fs/bcachefs/bcachefs_format.h8
-rw-r--r--fs/bcachefs/bkey.h8
-rw-r--r--fs/bcachefs/bkey_methods.c2
-rw-r--r--fs/bcachefs/bkey_methods.h2
-rw-r--r--fs/bcachefs/btree_gc.c8
-rw-r--r--fs/bcachefs/btree_io.c6
-rw-r--r--fs/bcachefs/btree_node_scan.c2
-rw-r--r--fs/bcachefs/btree_trans_commit.c108
-rw-r--r--fs/bcachefs/btree_update.h3
-rw-r--r--fs/bcachefs/data_update.c2
-rw-r--r--fs/bcachefs/disk_accounting.c82
-rw-r--r--fs/bcachefs/disk_accounting.h29
-rw-r--r--fs/bcachefs/disk_accounting_types.h2
-rw-r--r--fs/bcachefs/error.c14
-rw-r--r--fs/bcachefs/error.h2
-rw-r--r--fs/bcachefs/fsck.c295
-rw-r--r--fs/bcachefs/inode.c12
-rw-r--r--fs/bcachefs/inode.h1
-rw-r--r--fs/bcachefs/io_read.c4
-rw-r--r--fs/bcachefs/io_write.c4
-rw-r--r--fs/bcachefs/journal_io.c2
-rw-r--r--fs/bcachefs/logged_ops.c13
-rw-r--r--fs/bcachefs/recovery.c7
-rw-r--r--fs/bcachefs/recovery_passes_types.h2
-rw-r--r--fs/bcachefs/reflink.c2
-rw-r--r--fs/bcachefs/replicas.c18
-rw-r--r--fs/bcachefs/replicas.h2
-rw-r--r--fs/bcachefs/sb-clean.c1
-rw-r--r--fs/bcachefs/sb-downgrade.c9
-rw-r--r--fs/bcachefs/sb-errors.c6
-rw-r--r--fs/bcachefs/sb-errors.h2
-rw-r--r--fs/bcachefs/sb-errors_format.h39
-rw-r--r--fs/bcachefs/six.c12
-rw-r--r--fs/bcachefs/snapshot.c3
-rw-r--r--fs/bcachefs/subvolume.c54
-rw-r--r--fs/bcachefs/super-io.c7
-rw-r--r--fs/bcachefs/tests.c2
-rw-r--r--fs/cachefiles/namei.c7
-rw-r--r--fs/ceph/addr.c1
-rw-r--r--fs/ceph/caps.c29
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/inode.c2
-rw-r--r--fs/ceph/mds_client.c25
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/super.c1
-rw-r--r--fs/ceph/super.h7
-rw-r--r--fs/netfs/internal.h1
-rw-r--r--fs/netfs/misc.c74
-rw-r--r--fs/netfs/write_issue.c24
-rw-r--r--fs/pidfs.c5
-rw-r--r--fs/smb/client/cifsencrypt.c151
-rw-r--r--fs/smb/client/cifsglob.h2
-rw-r--r--fs/smb/client/sess.c2
-rw-r--r--fs/smb/client/smb2misc.c28
-rw-r--r--fs/smb/client/smb2ops.c47
-rw-r--r--fs/smb/client/smb2pdu.c10
-rw-r--r--fs/smb/client/smb2proto.h2
-rw-r--r--fs/smb/client/smb2transport.c32
-rw-r--r--fs/smb/common/smb2pdu.h6
-rw-r--r--fs/smb/server/connection.c2
-rw-r--r--fs/smb/server/ksmbd_netlink.h2
-rw-r--r--fs/smb/server/oplock.c4
-rw-r--r--fs/smb/server/server.c2
-rw-r--r--fs/smb/server/smb2pdu.c35
-rw-r--r--fs/smb/server/smb2pdu.h4
-rw-r--r--fs/smb/server/smb_common.c2
-rw-r--r--fs/smb/server/vfs_cache.h4
-rw-r--r--fs/smb/server/xattr.h2
-rw-r--r--include/linux/ceph/osd_client.h2
-rw-r--r--include/linux/cleanup.h136
-rw-r--r--include/linux/folio_queue.h168
-rw-r--r--include/linux/kvm_host.h18
-rw-r--r--include/trace/events/dma.h37
-rw-r--r--include/trace/events/netfs.h3
-rw-r--r--kernel/jump_label.c34
-rw-r--r--kernel/locking/lockdep.c53
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/rwsem.c22
-rw-r--r--kernel/module/Kconfig77
-rw-r--r--kernel/module/debug_kmemleak.c18
-rw-r--r--kernel/module/sysfs.c63
-rw-r--r--kernel/static_call_inline.c13
-rw-r--r--net/ceph/messenger.c2
-rw-r--r--scripts/Makefile.modinst2
-rw-r--r--scripts/coccinelle/api/string_choices.cocci259
-rw-r--r--tools/include/linux/linkage.h4
-rw-r--r--tools/testing/selftests/kvm/.gitignore4
-rw-r--r--tools/testing/selftests/kvm/Makefile4
-rw-r--r--tools/testing/selftests/kvm/coalesced_io_test.c236
-rw-r--r--tools/testing/selftests/kvm/guest_print_test.c19
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h28
-rw-r--r--tools/testing/selftests/kvm/include/s390x/debug_print.h69
-rw-r--r--tools/testing/selftests/kvm/include/s390x/processor.h5
-rw-r--r--tools/testing/selftests/kvm/include/s390x/sie.h240
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/apic.h21
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/hyperv.h18
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h7
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c85
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/processor.c10
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/hyperv.c67
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c69
-rw-r--r--tools/testing/selftests/kvm/memslot_modification_stress_test.c19
-rw-r--r--tools/testing/selftests/kvm/memslot_perf_test.c12
-rw-r--r--tools/testing/selftests/kvm/s390x/cmma_test.c7
-rw-r--r--tools/testing/selftests/kvm/s390x/config2
-rw-r--r--tools/testing/selftests/kvm/s390x/debug_test.c4
-rw-r--r--tools/testing/selftests/kvm/s390x/memop.c4
-rw-r--r--tools/testing/selftests/kvm/s390x/tprot.c5
-rw-r--r--tools/testing/selftests/kvm/s390x/ucontrol_test.c332
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c29
-rw-r--r--tools/testing/selftests/kvm/x86_64/debug_regs.c11
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c2
-rw-r--r--tools/testing/selftests/kvm/x86_64/sev_smoke_test.c32
-rw-r--r--tools/testing/selftests/kvm/x86_64/xapic_state_test.c54
-rw-r--r--tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c1
-rw-r--r--tools/testing/selftests/vDSO/vdso_standalone_test_x86.c2
-rw-r--r--virt/kvm/coalesced_mmio.c31
-rw-r--r--virt/kvm/kvm_main.c281
352 files changed, 6245 insertions(+), 3308 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bb48ae24ae69..1518343bbe22 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2677,6 +2677,23 @@
Default is Y (on).
+ kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86]
+ If enabled, KVM will enable virtualization in hardware
+ when KVM is loaded, and disable virtualization when KVM
+ is unloaded (if KVM is built as a module).
+
+ If disabled, KVM will dynamically enable and disable
+ virtualization on-demand when creating and destroying
+ VMs, i.e. on the 0=>1 and 1=>0 transitions of the
+ number of VMs.
+
+			Enabling virtualization at module load avoids potential
+			latency for creation of the 0=>1 VM, as KVM serializes
+			virtualization enabling across all online CPUs. The
+			"cost" of enabling virtualization when KVM is loaded
+			is that doing so may interfere with using out-of-tree
+			hypervisors that want to "own" virtualization hardware.
+
kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
Default is false (don't support).
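
The 0=>1 and 1=>0 transitions above amount to reference counting live VMs. A
minimal sketch of the on-demand scheme, using hypothetical helper names for the
per-arch enable/disable callbacks (the real logic lives in virt/kvm/kvm_main.c
around kvm_usage_count):

  static DEFINE_MUTEX(usage_lock);
  static int usage_count;         /* number of live VMs */

  static int on_vm_create(void)
  {
          int ret = 0;

          mutex_lock(&usage_lock);
          if (++usage_count == 1) {
                  /* Serialized across all online CPUs, hence the latency. */
                  ret = enable_virtualization_on_cpus();  /* hypothetical */
                  if (ret)
                          --usage_count;
          }
          mutex_unlock(&usage_lock);
          return ret;
  }

  static void on_vm_destroy(void)
  {
          mutex_lock(&usage_lock);
          if (--usage_count == 0)
                  disable_virtualization_on_cpus();       /* hypothetical */
          mutex_unlock(&usage_lock);
  }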
diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst
index ea744cbc8687..eba1991fbdba 100644
--- a/Documentation/arch/s390/vfio-ap.rst
+++ b/Documentation/arch/s390/vfio-ap.rst
@@ -999,6 +999,36 @@ the vfio_ap mediated device to which it is assigned as long as each new APQN
resulting from plugging it in references a queue device bound to the vfio_ap
device driver.
+Driver Features
+===============
+The vfio_ap driver exposes a sysfs file containing supported features.
+This exists so third party tools (like Libvirt and mdevctl) can query the
+availability of specific features.
+
+The features list can be found here: /sys/bus/matrix/devices/matrix/features
+
+Entries are space delimited. Each entry consists of a combination of
+alphanumeric and underscore characters.
+
+Example:
+cat /sys/bus/matrix/devices/matrix/features
+guest_matrix dyn ap_config
+
+The following features are advertised:
+
++--------------+---------------------------------------------------------------+
+| Flag         | Description                                                   |
++==============+===============================================================+
+| guest_matrix | guest_matrix attribute exists. It reports the matrix of       |
+|              | adapters and domains that are or will be passed through to a  |
+|              | guest when the mdev is attached to it.                        |
++--------------+---------------------------------------------------------------+
+| dyn          | Indicates hot plug/unplug of AP adapters, domains and control |
+|              | domains for a guest to which the mdev is attached.            |
++--------------+---------------------------------------------------------------+
+| ap_config    | ap_config interface for one-shot modifications to mdev config |
++--------------+---------------------------------------------------------------+
+
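
For tooling, a feature check reduces to tokenizing that file on spaces. A
userspace sketch (helper name is hypothetical; error handling kept minimal)::

  #include <stdio.h>
  #include <string.h>
  #include <stdbool.h>

  static bool vfio_ap_has_feature(const char *flag)
  {
          char buf[256], *tok, *save;
          FILE *f = fopen("/sys/bus/matrix/devices/matrix/features", "r");

          if (!f)
                  return false;
          if (!fgets(buf, sizeof(buf), f)) {
                  fclose(f);
                  return false;
          }
          fclose(f);

          buf[strcspn(buf, "\n")] = '\0';
          for (tok = strtok_r(buf, " ", &save); tok;
               tok = strtok_r(NULL, " ", &save))
                  if (!strcmp(tok, flag))
                          return true;
          return false;
  }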
Limitations
===========
Live guest migration is not supported for guests using AP devices without
diff --git a/Documentation/core-api/cleanup.rst b/Documentation/core-api/cleanup.rst
new file mode 100644
index 000000000000..527eb2f8ec6e
--- /dev/null
+++ b/Documentation/core-api/cleanup.rst
@@ -0,0 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+Scope-based Cleanup Helpers
+===========================
+
+.. kernel-doc:: include/linux/cleanup.h
+ :doc: scope-based cleanup helpers
diff --git a/Documentation/core-api/folio_queue.rst b/Documentation/core-api/folio_queue.rst
new file mode 100644
index 000000000000..1fe7a9bc4b8d
--- /dev/null
+++ b/Documentation/core-api/folio_queue.rst
@@ -0,0 +1,212 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+===========
+Folio Queue
+===========
+
+:Author: David Howells <[email protected]>
+
+.. Contents:
+
+ * Overview
+ * Initialisation
+ * Adding and removing folios
+ * Querying information about a folio
+ * Querying information about a folio_queue
+ * Folio queue iteration
+ * Folio marks
+ * Lockless simultaneous production/consumption issues
+
+
+Overview
+========
+
+The folio_queue struct forms a single segment in a segmented list of folios
+that can be used to form an I/O buffer. As such, the list can be iterated over
+using the ITER_FOLIOQ iov_iter type.
+
+The publicly accessible members of the structure are::
+
+ struct folio_queue {
+ struct folio_queue *next;
+ struct folio_queue *prev;
+ ...
+ };
+
+A pair of pointers are provided, ``next`` and ``prev``, that point to the
+segments on either side of the segment being accessed. Whilst this is a
+doubly-linked list, it is intentionally not a circular list; the outward
+sibling pointers in terminal segments should be NULL.
+
+Each segment in the list also stores:
+
+ * an ordered sequence of folio pointers,
+ * the size of each folio and
+ * three 1-bit marks per folio,
+
+but these should not be accessed directly as the underlying data structure may
+change; rather, the access functions outlined below should be used.
+
+The facility can be made accessible by::
+
+ #include <linux/folio_queue.h>
+
+and to use the iterator::
+
+ #include <linux/uio.h>
+
+
+Initialisation
+==============
+
+A segment should be initialised by calling::
+
+ void folioq_init(struct folio_queue *folioq);
+
+with a pointer to the segment to be initialised. Note that this will not
+necessarily initialise all the folio pointers, so care must be taken to check
+the number of folios added.
+
+
+Adding and removing folios
+==========================
+
+Folios can be set in the next unused slot in a segment struct by calling one
+of::
+
+ unsigned int folioq_append(struct folio_queue *folioq,
+ struct folio *folio);
+
+ unsigned int folioq_append_mark(struct folio_queue *folioq,
+ struct folio *folio);
+
+Both functions update the stored folio count, store the folio and note its
+size. The second function also sets the first mark for the folio added. Both
+functions return the number of the slot used. [!] Note that no attempt is made
+to check that the capacity wasn't overrun and the list will not be extended
+automatically.
+
+A folio can be excised by calling::
+
+ void folioq_clear(struct folio_queue *folioq, unsigned int slot);
+
+This clears the slot in the array and also clears all the marks for that folio,
+but doesn't change the folio count - so future accesses of that slot must check
+if the slot is occupied.
+
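
For illustration, a producer loop built from the calls above might look like
this (a sketch; get_next_folio() is a hypothetical folio source and allocation
failure checking is elided)::

  struct folio_queue *folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
  unsigned int slot;

  folioq_init(folioq);
  while (!folioq_full(folioq)) {
          struct folio *folio = get_next_folio();  /* hypothetical source */

          if (!folio)
                  break;
          slot = folioq_append(folioq, folio);     /* slot actually used */
  }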
+
+Querying information about a folio
+==================================
+
+Information about the folio in a particular slot may be queried by the
+following function::
+
+ struct folio *folioq_folio(const struct folio_queue *folioq,
+ unsigned int slot);
+
+If a folio has not yet been set in that slot, this may yield an undefined
+pointer. The size of the folio in a slot may be queried with either of::
+
+ unsigned int folioq_folio_order(const struct folio_queue *folioq,
+ unsigned int slot);
+
+ size_t folioq_folio_size(const struct folio_queue *folioq,
+ unsigned int slot);
+
+The first function returns the size as an order and the second as a number of
+bytes.
+
+
+Querying information about a folio_queue
+========================================
+
+Information may be retrieved about a particular segment with the following
+functions::
+
+ unsigned int folioq_nr_slots(const struct folio_queue *folioq);
+
+ unsigned int folioq_count(struct folio_queue *folioq);
+
+ bool folioq_full(struct folio_queue *folioq);
+
+The first function returns the maximum capacity of a segment. It must not be
+assumed that this won't vary between segments. The second returns the number
+of folios added to a segment and the third is a shorthand to indicate if the
+segment has been filled to capacity.
+
+Note that the count and fullness are not affected by clearing folios from the
+segment. These are more about indicating how many slots in the array have been
+initialised, and it is assumed that slots won't get reused, but rather the
+segment will get discarded as the queue is consumed.
+
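
Consequently, a consumer walks the initialised slots and must tolerate cleared
ones, e.g. (a sketch; process_folio() is hypothetical)::

  unsigned int slot, n = folioq_count(folioq);

  for (slot = 0; slot < n; slot++) {
          struct folio *folio = folioq_folio(folioq, slot);

          if (!folio)
                  continue;       /* excised with folioq_clear() */
          process_folio(folio, folioq_folio_size(folioq, slot));
  }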
+
+Folio marks
+===========
+
+Folios within a queue can also have marks assigned to them. These marks can be
+used to note information such as whether a folio needs folio_put() called on it.
+There are three marks available to be set for each folio.
+
+The marks can be set by::
+
+ void folioq_mark(struct folio_queue *folioq, unsigned int slot);
+ void folioq_mark2(struct folio_queue *folioq, unsigned int slot);
+ void folioq_mark3(struct folio_queue *folioq, unsigned int slot);
+
+Cleared by::
+
+ void folioq_unmark(struct folio_queue *folioq, unsigned int slot);
+ void folioq_unmark2(struct folio_queue *folioq, unsigned int slot);
+ void folioq_unmark3(struct folio_queue *folioq, unsigned int slot);
+
+And the marks can be queried by::
+
+ bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot);
+ bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot);
+ bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot);
+
+The marks can be used for any purpose and are not interpreted by this API.
+
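
A sketch of the "needs folio_put()" convention mentioned above, using the first
mark (the ownership semantics here are an assumption for illustration)::

  /* Producer: record that the queue owns a reference on this folio. */
  slot = folioq_append_mark(folioq, folio);

  /* Consumer: drop owned references when retiring the segment. */
  for (slot = 0; slot < folioq_count(folioq); slot++) {
          if (folioq_is_marked(folioq, slot))
                  folio_put(folioq_folio(folioq, slot));
          folioq_clear(folioq, slot);     /* also clears the marks */
  }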
+
+Folio queue iteration
+=====================
+
+A list of segments may be iterated over using the I/O iterator facility using
+an ``iov_iter`` iterator of ``ITER_FOLIOQ`` type. The iterator may be
+initialised with::
+
+ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
+ const struct folio_queue *folioq,
+ unsigned int first_slot, unsigned int offset,
+ size_t count);
+
+This may be told to start at a particular segment, slot and offset within a
+queue. The iov iterator functions will follow the next pointers when advancing
+and the prev pointers when reverting, as needed.
+
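
For instance, exposing a whole queue as the destination buffer for a copy might
look like this (a sketch; kbuf and len are assumed to describe the source
data)::

  struct iov_iter iter;
  size_t copied;

  iov_iter_folio_queue(&iter, ITER_DEST, folioq, 0, 0, len);
  copied = copy_to_iter(kbuf, len, &iter);        /* fills the folios */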
+
+Lockless simultaneous production/consumption issues
+===================================================
+
+If properly managed, the list can be extended by the producer at the head end
+and shortened by the consumer at the tail end simultaneously without the need
+to take locks. The ITER_FOLIOQ iterator inserts appropriate barriers to aid
+with this.
+
+Care must be taken when simultaneously producing and consuming a list. If the
+last segment is reached and the folios it refers to are entirely consumed by
+the IOV iterators, an iov_iter struct will be left pointing to the last segment
+with a slot number equal to the capacity of that segment. The iterator will
+try to continue on from this if there's another segment available when it is
+used again, but care must be taken lest the segment be removed and freed by
+the consumer before the iterator is advanced.
+
+It is recommended that the queue always contain at least one segment, even if
+that segment has never been filled or is entirely spent. This prevents the
+head and tail pointers from collapsing.
+
+
+API Function Reference
+======================
+
+.. kernel-doc:: include/linux/folio_queue.h
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index e18a2ffe0787..a331d2c814f5 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -35,6 +35,7 @@ Library functionality that is used throughout the kernel.
kobject
kref
+ cleanup
assoc_array
xarray
maple_tree
diff --git a/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml b/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml
index 72c1d9e82c89..8a1369df4ecb 100644
--- a/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml
+++ b/Documentation/devicetree/bindings/mailbox/mtk,adsp-mbox.yaml
@@ -17,9 +17,15 @@ description: |
properties:
compatible:
- enum:
- - mediatek,mt8195-adsp-mbox
- - mediatek,mt8186-adsp-mbox
+ oneOf:
+ - enum:
+ - mediatek,mt8186-adsp-mbox
+ - mediatek,mt8195-adsp-mbox
+ - items:
+ - enum:
+ - mediatek,mt8188-adsp-mbox
+ - const: mediatek,mt8186-adsp-mbox
+
"#mbox-cells":
const: 0
diff --git a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml
index 05e4e1d51713..2d66770ed361 100644
--- a/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml
+++ b/Documentation/devicetree/bindings/mailbox/qcom-ipcc.yaml
@@ -24,7 +24,9 @@ properties:
compatible:
items:
- enum:
+ - qcom,qcs8300-ipcc
- qcom,qdu1000-ipcc
+ - qcom,sa8255p-ipcc
- qcom,sa8775p-ipcc
- qcom,sc7280-ipcc
- qcom,sc8280xp-ipcc
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index b3be87489108..e32471977d0a 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4214,7 +4214,9 @@ whether or not KVM_CAP_X86_USER_SPACE_MSR's KVM_MSR_EXIT_REASON_FILTER is
enabled. If KVM_MSR_EXIT_REASON_FILTER is enabled, KVM will exit to userspace
on denied accesses, i.e. userspace effectively intercepts the MSR access. If
KVM_MSR_EXIT_REASON_FILTER is not enabled, KVM will inject a #GP into the guest
-on denied accesses.
+on denied accesses. Note, if an MSR access is denied during emulation of MSR
+load/stores during VMX transitions, KVM ignores KVM_MSR_EXIT_REASON_FILTER.
+See the below warning for full details.
If an MSR access is allowed by userspace, KVM will emulate and/or virtualize
the access in accordance with the vCPU model. Note, KVM may still ultimately
@@ -4229,9 +4231,22 @@ filtering. In that mode, ``KVM_MSR_FILTER_DEFAULT_DENY`` is invalid and causes
an error.
.. warning::
- MSR accesses as part of nested VM-Enter/VM-Exit are not filtered.
- This includes both writes to individual VMCS fields and reads/writes
- through the MSR lists pointed to by the VMCS.
+ MSR accesses that are side effects of instruction execution (emulated or
+ native) are not filtered as hardware does not honor MSR bitmaps outside of
+ RDMSR and WRMSR, and KVM mimics that behavior when emulating instructions
+ to avoid pointless divergence from hardware. E.g. RDPID reads MSR_TSC_AUX,
+ SYSENTER reads the SYSENTER MSRs, etc.
+
+ MSRs that are loaded/stored via dedicated VMCS fields are not filtered as
+ part of VM-Enter/VM-Exit emulation.
+
+ MSRs that are loaded/stored via VMX's load/store lists _are_ filtered as part
+ of VM-Enter/VM-Exit emulation. If an MSR access is denied on VM-Enter, KVM
+ synthesizes a consistency check VM-Exit (EXIT_REASON_MSR_LOAD_FAIL). If an
+ MSR access is denied on VM-Exit, KVM synthesizes a VM-Abort. In short, KVM
+ extends Intel's architectural list of MSRs that cannot be loaded/saved via
+ the VM-Enter/VM-Exit MSR lists. It is the platform owner's responsibility
+ to communicate any such restrictions to their end users.
x2APIC MSR accesses cannot be filtered (KVM silently ignores filters that
cover any x2APIC MSRs).
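
For reference, the filters discussed above are installed with the
KVM_X86_SET_MSR_FILTER ioctl. A minimal sketch that denies writes to one range
(vm_fd is an assumed VM file descriptor; error handling omitted)::

  /* Default-allow, but intercept writes to the three SYSENTER MSRs. */
  __u8 bitmap[1] = { 0 };  /* clear bits => covered MSRs are denied */
  struct kvm_msr_filter filter = {
          .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
          .ranges[0] = {
                  .flags  = KVM_MSR_FILTER_WRITE,
                  .base   = 0x174,        /* MSR_IA32_SYSENTER_CS..EIP */
                  .nmsrs  = 3,
                  .bitmap = bitmap,
          },
  };

  ioctl(vm_fd, KVM_X86_SET_MSR_FILTER, &filter);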
@@ -8082,6 +8097,14 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
guest CPUID on writes to MISC_ENABLE if
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
disabled.
+
+KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
+ a fast way for memslot deletion when the
+ VM type is KVM_X86_DEFAULT_VM.
+ When this quirk is disabled or when VM type
+ is other than KVM_X86_DEFAULT_VM, KVM zaps
+ only leaf SPTEs that are within the range of
+ the memslot being deleted.
=================================== ============================================
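
Quirks in the table above are opted out of per-VM via KVM_ENABLE_CAP on
KVM_CAP_DISABLE_QUIRKS2, e.g. (a sketch; vm_fd is an assumed VM file
descriptor)::

  struct kvm_enable_cap cap = {
          .cap     = KVM_CAP_DISABLE_QUIRKS2,
          .args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL,  /* mask of quirks to disable */
  };

  ioctl(vm_fd, KVM_ENABLE_CAP, &cap);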
7.32 KVM_CAP_MAX_VCPU_ID
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 02880d5552d5..20a9a37d1cdd 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -11,6 +11,8 @@ The acquisition orders for mutexes are as follows:
- cpus_read_lock() is taken outside kvm_lock
+- kvm_usage_lock is taken outside cpus_read_lock()
+
- kvm->lock is taken outside vcpu->mutex
- kvm->lock is taken outside kvm->slots_lock and kvm->irq_lock
@@ -24,6 +26,13 @@ The acquisition orders for mutexes are as follows:
are taken on the waiting side when modifying memslots, so MMU notifiers
must not take either kvm->slots_lock or kvm->slots_arch_lock.
+cpus_read_lock() vs kvm_lock:
+
+- Taking cpus_read_lock() outside of kvm_lock is problematic, despite that
+ being the official ordering, as it is quite easy to unknowingly trigger
+ cpus_read_lock() while holding kvm_lock. Use caution when walking vm_list,
+ e.g. avoid complex operations when possible.
+
For SRCU:
- ``synchronize_srcu(&kvm->srcu)`` is called inside critical sections
@@ -227,10 +236,16 @@ time it will be set using the Dirty tracking mechanism described above.
:Type: mutex
:Arch: any
:Protects: - vm_list
- - kvm_usage_count
+
+``kvm_usage_lock``
+^^^^^^^^^^^^^^^^^^
+
+:Type: mutex
+:Arch: any
+:Protects: - kvm_usage_count
- hardware virtualization enable/disable
-:Comment: KVM also disables CPU hotplug via cpus_read_lock() during
- enable/disable.
+:Comment: Exists to allow taking cpus_read_lock() while kvm_usage_count is
+ protected, which simplifies the virtualization enabling logic.
``kvm->mn_invalidate_lock``
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -290,11 +305,12 @@ time it will be set using the Dirty tracking mechanism described above.
wakeup.
``vendor_module_lock``
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^
:Type: mutex
:Arch: x86
:Protects: loading a vendor module (kvm_amd or kvm_intel)
-:Comment: Exists because using kvm_lock leads to deadlock. cpu_hotplug_lock is
- taken outside of kvm_lock, e.g. in KVM's CPU online/offline callbacks, and
- many operations need to take cpu_hotplug_lock when loading a vendor module,
- e.g. updating static calls.
+:Comment: Exists because using kvm_lock leads to deadlock. kvm_lock is taken
+ in notifiers, e.g. __kvmclock_cpufreq_notifier(), that may be invoked while
+ cpu_hotplug_lock is held, e.g. from cpufreq_boost_trigger_state(), and many
+ operations need to take cpu_hotplug_lock when loading a vendor module, e.g.
+ updating static calls.
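
Expressed as code, the kvm_usage_lock vs cpus_read_lock() ordering documented
above looks like this on the enable path (illustrative sketch only; the per-CPU
callback name is hypothetical):

  mutex_lock(&kvm_usage_lock);            /* protects kvm_usage_count */
  cpus_read_lock();                       /* taken inside kvm_usage_lock */
  if (++kvm_usage_count == 1)
          on_each_cpu(enable_virt_cpu, NULL, 1);  /* hypothetical callback */
  cpus_read_unlock();
  mutex_unlock(&kvm_usage_lock);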
diff --git a/MAINTAINERS b/MAINTAINERS
index 5834a1cd10d7..c27f3190737f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15678,6 +15678,9 @@ F: include/dt-bindings/clock/mobileye,eyeq5-clk.h
MODULE SUPPORT
M: Luis Chamberlain <[email protected]>
+R: Petr Pavlu <[email protected]>
+R: Sami Tolvanen <[email protected]>
+R: Daniel Gomez <[email protected]>
S: Maintained
diff --git a/Makefile b/Makefile
index 265dd990a9b6..187a4ce2728e 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
# SPDX-License-Identifier: GPL-2.0
VERSION = 6
-PATCHLEVEL = 11
+PATCHLEVEL = 12
SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION = -rc1
NAME = Baby Opossum Posse
# *DOCUMENTATION*
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index fe0764173cd0..a0d01c46e408 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2164,7 +2164,7 @@ static void cpu_hyp_uninit(void *discard)
}
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
/*
* Most calls to this function are made with migration
@@ -2184,7 +2184,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
kvm_timer_cpu_down();
kvm_vgic_cpu_down();
@@ -2380,7 +2380,7 @@ static int __init do_pkvm_init(u32 hyp_va_bits)
/*
* The stub hypercalls are now disabled, so set our local flag to
- * prevent a later re-init attempt in kvm_arch_hardware_enable().
+ * prevent a later re-init attempt in kvm_arch_enable_virtualization_cpu().
*/
__this_cpu_write(kvm_hyp_initialized, 1);
preempt_enable();
diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c
index 844736b99d38..27e9b94c0a0b 100644
--- a/arch/loongarch/kvm/main.c
+++ b/arch/loongarch/kvm/main.c
@@ -261,7 +261,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -ENOIOCTLCMD;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
unsigned long env, gcfg = 0;
@@ -300,7 +300,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
write_csr_gcfg(0);
write_csr_gstat(0);
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6743a57c1ab4..f7222eb594ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -728,8 +728,8 @@ struct kvm_mips_callbacks {
int (*handle_fpe)(struct kvm_vcpu *vcpu);
int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
- int (*hardware_enable)(void);
- void (*hardware_disable)(void);
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
int (*check_extension)(struct kvm *kvm, long ext);
int (*vcpu_init)(struct kvm_vcpu *vcpu);
void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index b5de770b092e..60b43ea85c12 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -125,14 +125,14 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return 1;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
- return kvm_mips_callbacks->hardware_enable();
+ return kvm_mips_callbacks->enable_virtualization_cpu();
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
- kvm_mips_callbacks->hardware_disable();
+ kvm_mips_callbacks->disable_virtualization_cpu();
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 99d5a71e4300..ccab4d76b126 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -2869,7 +2869,7 @@ static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
return ret + 1;
}
-static int kvm_vz_hardware_enable(void)
+static int kvm_vz_enable_virtualization_cpu(void)
{
unsigned int mmu_size, guest_mmu_size, ftlb_size;
u64 guest_cvmctl, cvmvmconfig;
@@ -2983,7 +2983,7 @@ static int kvm_vz_hardware_enable(void)
return 0;
}
-static void kvm_vz_hardware_disable(void)
+static void kvm_vz_disable_virtualization_cpu(void)
{
u64 cvmvmconfig;
unsigned int mmu_size;
@@ -3280,8 +3280,8 @@ static struct kvm_mips_callbacks kvm_vz_callbacks = {
.handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
.handle_guest_exit = kvm_trap_vz_handle_guest_exit,
- .hardware_enable = kvm_vz_hardware_enable,
- .hardware_disable = kvm_vz_hardware_disable,
+ .enable_virtualization_cpu = kvm_vz_enable_virtualization_cpu,
+ .disable_virtualization_cpu = kvm_vz_disable_virtualization_cpu,
.check_extension = kvm_vz_check_extension,
.vcpu_init = kvm_vz_vcpu_init,
.vcpu_uninit = kvm_vz_vcpu_uninit,
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index bab2ec34cd87..f3427f6de608 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -20,7 +20,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
return -EINVAL;
}
-int kvm_arch_hardware_enable(void)
+int kvm_arch_enable_virtualization_cpu(void)
{
csr_write(CSR_HEDELEG, KVM_HEDELEG_DEFAULT);
csr_write(CSR_HIDELEG, KVM_HIDELEG_DEFAULT);
@@ -35,7 +35,7 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
kvm_riscv_aia_disable();
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 7ec1b8cd0de9..9b57add02cd5 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -59,6 +59,7 @@ CONFIG_CMM=m
CONFIG_APPLDATA_BASE=y
CONFIG_S390_HYPFS_FS=y
CONFIG_KVM=m
+CONFIG_KVM_S390_UCONTROL=y
CONFIG_S390_UNWIND_SELFTEST=m
CONFIG_S390_KPROBES_SANITY_TEST=m
CONFIG_S390_MODULES_SANITY_TEST=m
diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
index e26e68675c08..aa06c85bcbd3 100644
--- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S
+++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S
@@ -13,10 +13,7 @@
* for details.
*/
.macro vdso_func func
- .globl __kernel_\func
- .type __kernel_\func,@function
- __ALIGN
-__kernel_\func:
+SYM_FUNC_START(__kernel_\func)
CFI_STARTPROC
aghi %r15,-STACK_FRAME_VDSO_OVERHEAD
CFI_DEF_CFA_OFFSET (STACK_FRAME_USER_OVERHEAD + STACK_FRAME_VDSO_OVERHEAD)
@@ -32,7 +29,7 @@ __kernel_\func:
CFI_RESTORE 15
br %r14
CFI_ENDPROC
- .size __kernel_\func,.-__kernel_\func
+SYM_FUNC_END(__kernel_\func)
.endm
vdso_func gettimeofday
@@ -41,16 +38,13 @@ vdso_func clock_gettime
vdso_func getcpu
.macro vdso_syscall func,syscall
- .globl __kernel_\func
- .type __kernel_\func,@function
- __ALIGN
-__kernel_\func:
+SYM_FUNC_START(__kernel_\func)
CFI_STARTPROC
svc \syscall
/* Make sure we notice when a syscall returns, which shouldn't happen */
.word 0
CFI_ENDPROC
- .size __kernel_\func,.-__kernel_\func
+SYM_FUNC_END(__kernel_\func)
.endm
vdso_syscall restart_syscall,__NR_restart_syscall
diff --git a/arch/s390/kernel/vdso64/vgetrandom-chacha.S b/arch/s390/kernel/vdso64/vgetrandom-chacha.S
index d802b0a96f41..09c034c2f853 100644
--- a/arch/s390/kernel/vdso64/vgetrandom-chacha.S
+++ b/arch/s390/kernel/vdso64/vgetrandom-chacha.S
@@ -1,7 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/stringify.h>
#include <linux/linkage.h>
#include <asm/alternative.h>
+#include <asm/dwarf.h>
#include <asm/fpu-insn.h>
#define STATE0 %v0
@@ -12,9 +14,6 @@
#define COPY1 %v5
#define COPY2 %v6
#define COPY3 %v7
-#define PERM4 %v16
-#define PERM8 %v17
-#define PERM12 %v18
#define BEPERM %v19
#define TMP0 %v20
#define TMP1 %v21
@@ -23,13 +22,11 @@
.section .rodata
- .balign 128
-.Lconstants:
+ .balign 32
+SYM_DATA_START_LOCAL(chacha20_constants)
.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
- .long 0x04050607,0x08090a0b,0x0c0d0e0f,0x00010203 # rotl 4 bytes
- .long 0x08090a0b,0x0c0d0e0f,0x00010203,0x04050607 # rotl 8 bytes
- .long 0x0c0d0e0f,0x00010203,0x04050607,0x08090a0b # rotl 12 bytes
.long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c # byte swap
+SYM_DATA_END(chacha20_constants)
.text
/*
@@ -43,13 +40,14 @@
* size_t nblocks)
*/
SYM_FUNC_START(__arch_chacha20_blocks_nostack)
- larl %r1,.Lconstants
+ CFI_STARTPROC
+ larl %r1,chacha20_constants
/* COPY0 = "expand 32-byte k" */
VL COPY0,0,,%r1
- /* PERM4-PERM12,BEPERM = byte selectors for VPERM */
- VLM PERM4,BEPERM,16,%r1
+ /* BEPERM = byte selectors for VPERM */
+ ALTERNATIVE __stringify(VL BEPERM,16,,%r1), "brcl 0,0", ALT_FACILITY(148)
/* COPY1,COPY2 = key */
VLM COPY1,COPY2,0,%r3
@@ -89,11 +87,11 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
VERLLF STATE1,STATE1,7
/* STATE1[0,1,2,3] = STATE1[1,2,3,0] */
- VPERM STATE1,STATE1,STATE1,PERM4
+ VSLDB STATE1,STATE1,STATE1,4
/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
- VPERM STATE2,STATE2,STATE2,PERM8
+ VSLDB STATE2,STATE2,STATE2,8
/* STATE3[0,1,2,3] = STATE3[3,0,1,2] */
- VPERM STATE3,STATE3,STATE3,PERM12
+ VSLDB STATE3,STATE3,STATE3,12
/* STATE0 += STATE1, STATE3 = rotl32(STATE3 ^ STATE0, 16) */
VAF STATE0,STATE0,STATE1
@@ -116,32 +114,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
VERLLF STATE1,STATE1,7
/* STATE1[0,1,2,3] = STATE1[3,0,1,2] */
- VPERM STATE1,STATE1,STATE1,PERM12
+ VSLDB STATE1,STATE1,STATE1,12
/* STATE2[0,1,2,3] = STATE2[2,3,0,1] */
- VPERM STATE2,STATE2,STATE2,PERM8
+ VSLDB STATE2,STATE2,STATE2,8
/* STATE3[0,1,2,3] = STATE3[1,2,3,0] */
- VPERM STATE3,STATE3,STATE3,PERM4
+ VSLDB STATE3,STATE3,STATE3,4
brctg %r0,.Ldoubleround
- /* OUTPUT0 = STATE0 + STATE0 */
+ /* OUTPUT0 = STATE0 + COPY0 */
VAF STATE0,STATE0,COPY0
- /* OUTPUT1 = STATE1 + STATE1 */
+ /* OUTPUT1 = STATE1 + COPY1 */
VAF STATE1,STATE1,COPY1
- /* OUTPUT2 = STATE2 + STATE2 */
+ /* OUTPUT2 = STATE2 + COPY2 */
VAF STATE2,STATE2,COPY2
- /* OUTPUT2 = STATE3 + STATE3 */
+ /* OUTPUT3 = STATE3 + COPY3 */
VAF STATE3,STATE3,COPY3
- /*
- * 32 bit wise little endian store to OUTPUT. If the vector
- * enhancement facility 2 is not installed use the slow path.
- */
- ALTERNATIVE "brc 0xf,.Lstoreslow", "nop", ALT_FACILITY(148)
- VSTBRF STATE0,0,,%r2
- VSTBRF STATE1,16,,%r2
- VSTBRF STATE2,32,,%r2
- VSTBRF STATE3,48,,%r2
-.Lstoredone:
+ ALTERNATIVE \
+ __stringify( \
+ /* Convert STATE to little endian and store to OUTPUT */\
+ VPERM TMP0,STATE0,STATE0,BEPERM; \
+ VPERM TMP1,STATE1,STATE1,BEPERM; \
+ VPERM TMP2,STATE2,STATE2,BEPERM; \
+ VPERM TMP3,STATE3,STATE3,BEPERM; \
+ VSTM TMP0,TMP3,0,%r2), \
+ __stringify( \
+ /* 32 bit wise little endian store to OUTPUT */ \
+ VSTBRF STATE0,0,,%r2; \
+ VSTBRF STATE1,16,,%r2; \
+ VSTBRF STATE2,32,,%r2; \
+ VSTBRF STATE3,48,,%r2; \
+ brcl 0,0), \
+ ALT_FACILITY(148)
/* ++COPY3.COUNTER */
/* alsih %r3,1 */
@@ -173,13 +177,5 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
VZERO TMP3
br %r14
-
-.Lstoreslow:
- /* Convert STATE to little endian format and store to OUTPUT */
- VPERM TMP0,STATE0,STATE0,BEPERM
- VPERM TMP1,STATE1,STATE1,BEPERM
- VPERM TMP2,STATE2,STATE2,BEPERM
- VPERM TMP3,STATE3,STATE3,BEPERM
- VSTM TMP0,TMP3,0,%r2
- j .Lstoredone
+ CFI_ENDPROC
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0fd96860fc45..bb7134faaebf 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -348,20 +348,29 @@ static inline int plo_test_bit(unsigned char nr)
return cc == 0;
}
-static __always_inline void __insn32_query(unsigned int opcode, u8 *query)
+static __always_inline void __sortl_query(u8 (*query)[32])
{
asm volatile(
" lghi 0,0\n"
- " lgr 1,%[query]\n"
+ " la 1,%[query]\n"
/* Parameter registers are ignored */
- " .insn rrf,%[opc] << 16,2,4,6,0\n"
+ " .insn rre,0xb9380000,2,4\n"
+ : [query] "=R" (*query)
:
- : [query] "d" ((unsigned long)query), [opc] "i" (opcode)
- : "cc", "memory", "0", "1");
+ : "cc", "0", "1");
}
-#define INSN_SORTL 0xb938
-#define INSN_DFLTCC 0xb939
+static __always_inline void __dfltcc_query(u8 (*query)[32])
+{
+ asm volatile(
+ " lghi 0,0\n"
+ " la 1,%[query]\n"
+ /* Parameter registers are ignored */
+ " .insn rrf,0xb9390000,2,4,6,0\n"
+ : [query] "=R" (*query)
+ :
+ : "cc", "0", "1");
+}
static void __init kvm_s390_cpu_feat_init(void)
{
@@ -415,10 +424,10 @@ static void __init kvm_s390_cpu_feat_init(void)
kvm_s390_available_subfunc.kdsa);
if (test_facility(150)) /* SORTL */
- __insn32_query(INSN_SORTL, kvm_s390_available_subfunc.sortl);
+ __sortl_query(&kvm_s390_available_subfunc.sortl);
if (test_facility(151)) /* DFLTCC */
- __insn32_query(INSN_DFLTCC, kvm_s390_available_subfunc.dfltcc);
+ __dfltcc_query(&kvm_s390_available_subfunc.dfltcc);
if (MACHINE_HAS_ESOP)
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index da8b66dce0da..327c45c5013f 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -16,6 +16,7 @@
#include <asm/insn-eval.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
+#include <asm/traps.h>
/* MMIO direction */
#define EPT_READ 0
@@ -433,6 +434,11 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
return -EINVAL;
}
+ if (!fault_in_kernel_space(ve->gla)) {
+ WARN_ONCE(1, "Access to userspace address is not supported");
+ return -EINVAL;
+ }
+
/*
* Reject EPT violation #VEs that split pages.
*
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 8db2ec4d6cda..1f650b4dde50 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -163,20 +163,18 @@ static __always_inline s64 arch_atomic64_dec_return(atomic64_t *v)
}
#define arch_atomic64_dec_return arch_atomic64_dec_return
-static __always_inline s64 arch_atomic64_add(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_add(s64 i, atomic64_t *v)
{
__alternative_atomic64(add, add_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
ASM_NO_INPUT_CLOBBER("memory"));
- return i;
}
-static __always_inline s64 arch_atomic64_sub(s64 i, atomic64_t *v)
+static __always_inline void arch_atomic64_sub(s64 i, atomic64_t *v)
{
__alternative_atomic64(sub, sub_return,
ASM_OUTPUT2("+A" (i), "+c" (v)),
ASM_NO_INPUT_CLOBBER("memory"));
- return i;
}
static __always_inline void arch_atomic64_inc(atomic64_t *v)
diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h
index 80cc6386d7b1..ca4243318aad 100644
--- a/arch/x86/include/asm/cpuid.h
+++ b/arch/x86/include/asm/cpuid.h
@@ -179,6 +179,7 @@ static __always_inline bool cpuid_function_is_indexed(u32 function)
case 0x1d:
case 0x1e:
case 0x1f:
+ case 0x24:
case 0x8000001d:
return true;
}
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 44949f972826..1a42f829667a 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -135,6 +135,8 @@
#define INTEL_LUNARLAKE_M IFM(6, 0xBD)
+#define INTEL_PANTHERLAKE_L IFM(6, 0xCC)
+
/* "Small Core" Processors (Atom/E-Core) */
#define INTEL_ATOM_BONNELL IFM(6, 0x1C) /* Diamondville, Pineview */
@@ -178,4 +180,7 @@
#define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
#define INTEL_QUARK_X1000 IFM(5, 0x09) /* Quark X1000 SoC */
+/* Family 19 */
+#define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */
+
#endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 68ad4f923664..861d080ed4c6 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -14,8 +14,8 @@ BUILD_BUG_ON(1)
* be __static_call_return0.
*/
KVM_X86_OP(check_processor_compatibility)
-KVM_X86_OP(hardware_enable)
-KVM_X86_OP(hardware_disable)
+KVM_X86_OP(enable_virtualization_cpu)
+KVM_X86_OP(disable_virtualization_cpu)
KVM_X86_OP(hardware_unsetup)
KVM_X86_OP(has_emulated_msr)
KVM_X86_OP(vcpu_after_set_cpuid)
@@ -125,7 +125,7 @@ KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
KVM_X86_OP_OPTIONAL(vm_move_enc_context_from)
KVM_X86_OP_OPTIONAL(guest_memory_reclaimed)
-KVM_X86_OP(get_msr_feature)
+KVM_X86_OP(get_feature_msr)
KVM_X86_OP(check_emulate_instruction)
KVM_X86_OP(apic_init_signal_blocked)
KVM_X86_OP_OPTIONAL(enable_l2_tlb_flush)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4a68cb3eba78..6d9f763a7bb9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -36,6 +36,7 @@
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
+#include <asm/reboot.h>
#define __KVM_HAVE_ARCH_VCPU_DEBUGFS
@@ -211,6 +212,7 @@ enum exit_fastpath_completion {
EXIT_FASTPATH_NONE,
EXIT_FASTPATH_REENTER_GUEST,
EXIT_FASTPATH_EXIT_HANDLED,
+ EXIT_FASTPATH_EXIT_USERSPACE,
};
typedef enum exit_fastpath_completion fastpath_t;
@@ -280,10 +282,6 @@ enum x86_intercept_stage;
#define PFERR_PRIVATE_ACCESS BIT_ULL(49)
#define PFERR_SYNTHETIC_MASK (PFERR_IMPLICIT_ACCESS | PFERR_PRIVATE_ACCESS)
-#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
- PFERR_WRITE_MASK | \
- PFERR_PRESENT_MASK)
-
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -1629,8 +1627,10 @@ struct kvm_x86_ops {
int (*check_processor_compatibility)(void);
- int (*hardware_enable)(void);
- void (*hardware_disable)(void);
+ int (*enable_virtualization_cpu)(void);
+ void (*disable_virtualization_cpu)(void);
+ cpu_emergency_virt_cb *emergency_disable_virtualization_cpu;
+
void (*hardware_unsetup)(void);
bool (*has_emulated_msr)(struct kvm *kvm, u32 index);
void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu);
@@ -1727,6 +1727,8 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+
+ const bool x2apic_icr_is_split;
const unsigned long required_apicv_inhibits;
bool allow_apicv_in_x2apic_without_x2apic_virtualization;
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
@@ -1806,7 +1808,7 @@ struct kvm_x86_ops {
int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
void (*guest_memory_reclaimed)(struct kvm *kvm);
- int (*get_msr_feature)(struct kvm_msr_entry *entry);
+ int (*get_feature_msr)(u32 msr, u64 *data);
int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
void *insn, int insn_len);
@@ -2060,6 +2062,8 @@ void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu);
void kvm_enable_efer_bits(u64);
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data);
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data);
int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data);
@@ -2136,7 +2140,15 @@ int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu);
void kvm_update_dr7(struct kvm_vcpu *vcpu);
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry);
+
+static inline bool kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa)
+{
+ return __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, false);
+}
+
void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu,
ulong roots_to_free);
void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu);
@@ -2254,6 +2266,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
+int kvm_cpu_get_extint(struct kvm_vcpu *v);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
@@ -2345,7 +2358,8 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
- KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
+ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS | \
+ KVM_X86_QUIRK_SLOT_ZAP_ALL)
/*
* KVM previously used a u32 field in kvm_run to indicate the hypercall was
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index a7c06a46fb76..3ae84c3b8e6d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,6 +36,20 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
#define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS)
+/*
+ * Architectural memory types that are common to MTRRs, PAT, VMX MSRs, etc.
+ * Most MSRs support/allow only a subset of memory types, but the values
+ * themselves are common across all relevant MSRs.
+ */
+#define X86_MEMTYPE_UC 0ull /* Uncacheable, a.k.a. Strong Uncacheable */
+#define X86_MEMTYPE_WC 1ull /* Write Combining */
+/* RESERVED 2 */
+/* RESERVED 3 */
+#define X86_MEMTYPE_WT 4ull /* Write Through */
+#define X86_MEMTYPE_WP 5ull /* Write Protected */
+#define X86_MEMTYPE_WB 6ull /* Write Back */
+#define X86_MEMTYPE_UC_MINUS 7ull /* Weak Uncacheable (PAT only) */
+
/* FRED MSRs */
#define MSR_IA32_FRED_RSP0 0x1cc /* Level 0 stack pointer */
#define MSR_IA32_FRED_RSP1 0x1cd /* Level 1 stack pointer */
@@ -365,6 +379,12 @@
#define MSR_IA32_CR_PAT 0x00000277
+#define PAT_VALUE(p0, p1, p2, p3, p4, p5, p6, p7) \
+ ((X86_MEMTYPE_ ## p0) | (X86_MEMTYPE_ ## p1 << 8) | \
+ (X86_MEMTYPE_ ## p2 << 16) | (X86_MEMTYPE_ ## p3 << 24) | \
+ (X86_MEMTYPE_ ## p4 << 32) | (X86_MEMTYPE_ ## p5 << 40) | \
+ (X86_MEMTYPE_ ## p6 << 48) | (X86_MEMTYPE_ ## p7 << 56))
+
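
As a worked example of the macro above, the PAT layout used by the kernel's
memtype code (entry 0 in the low byte) can be composed as:

  u64 pat = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
  /* => 0x0007040600070406: PAT0=WB(6), PAT1=WT(4), PAT2=UC-(7), PAT3=UC(0),
   *    then the same four types repeated for PAT4..PAT7. */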
#define MSR_IA32_DEBUGCTLMSR 0x000001d9
#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
@@ -1159,15 +1179,6 @@
#define MSR_IA32_VMX_VMFUNC 0x00000491
#define MSR_IA32_VMX_PROCBASED_CTLS3 0x00000492
-/* VMX_BASIC bits and bitmasks */
-#define VMX_BASIC_VMCS_SIZE_SHIFT 32
-#define VMX_BASIC_TRUE_CTLS (1ULL << 55)
-#define VMX_BASIC_64 0x0001000000000000LLU
-#define VMX_BASIC_MEM_TYPE_SHIFT 50
-#define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU
-#define VMX_BASIC_MEM_TYPE_WB 6LLU
-#define VMX_BASIC_INOUT 0x0040000000000000LLU
-
/* Resctrl MSRs: */
/* - Intel: */
#define MSR_IA32_L3_QOS_CFG 0xc81
@@ -1185,11 +1196,6 @@
#define MSR_IA32_SMBA_BW_BASE 0xc0000280
#define MSR_IA32_EVT_CFG_BASE 0xc0000400
-/* MSR_IA32_VMX_MISC bits */
-#define MSR_IA32_VMX_MISC_INTEL_PT (1ULL << 14)
-#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
-#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
-
/* AMD-V MSRs */
#define MSR_VM_CR 0xc0010114
#define MSR_VM_IGNNE 0xc0010115
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 6536873f8fc0..c02183d3cdd7 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -25,12 +25,14 @@ void __noreturn machine_real_restart(unsigned int type);
#define MRR_BIOS 0
#define MRR_APM 1
-#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
typedef void (cpu_emergency_virt_cb)(void);
+#if IS_ENABLED(CONFIG_KVM_INTEL) || IS_ENABLED(CONFIG_KVM_AMD)
void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback);
void cpu_emergency_disable_virtualization(void);
#else
+static inline void cpu_emergency_register_virt_callback(cpu_emergency_virt_cb *callback) {}
+static inline void cpu_emergency_unregister_virt_callback(cpu_emergency_virt_cb *callback) {}
static inline void cpu_emergency_disable_virtualization(void) {}
#endif /* CONFIG_KVM_INTEL || CONFIG_KVM_AMD */
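Hoisting the typedef above the #if lets the new stubs compile even when neither KVM module is configured, so common code can register callbacks unconditionally. A hypothetical caller (my_emergency_cb is illustrative only, not from the patch):

    static void my_emergency_cb(void)
    {
            /* quiesce virtualization hardware before emergency reboot */
    }

    /* Compiles to a no-op when !CONFIG_KVM_INTEL && !CONFIG_KVM_AMD. */
    cpu_emergency_register_virt_callback(my_emergency_cb);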
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index f0dea3750ca9..2b59b9951c90 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -516,6 +516,20 @@ struct ghcb {
u32 ghcb_usage;
} __packed;
+struct vmcb {
+ struct vmcb_control_area control;
+ union {
+ struct vmcb_save_area save;
+
+ /*
+ * For SEV-ES VMs, the save area in the VMCB is used only to
+ * save/load host state. Guest state resides in a separate
+ * page, the aptly named VM Save Area (VMSA), that is encrypted
+ * with the guest's private key.
+ */
+ struct sev_es_save_area host_sev_es_save;
+ };
+} __packed;
#define EXPECTED_VMCB_SAVE_AREA_SIZE 744
#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032
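The union works because the control area is exactly 0x400 bytes, which is also the hard-coded offset the old sev_es_host_save_area() arithmetic used (see the svm.c hunk below). A sketch of the invariant, assuming EXPECTED_VMCB_CONTROL_AREA_SIZE is 1024 (the BUILD_BUG_ON added below enforces the same thing symbolically):

    /* Sketch: the host SEV-ES save area starts right after the control area. */
    static_assert(offsetof(struct vmcb, host_sev_es_save) == 0x400);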
@@ -532,6 +546,7 @@ static inline void __unused_size_checks(void)
BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(offsetof(struct vmcb, save) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
/* Check offsets of reserved fields */
@@ -568,11 +583,6 @@ static inline void __unused_size_checks(void)
BUILD_BUG_RESERVED_OFFSET(ghcb, 0xff0);
}
-struct vmcb {
- struct vmcb_control_area control;
- struct vmcb_save_area save;
-} __packed;
-
#define SVM_CPUID_FUNC 0x8000000a
#define SVM_SELECTOR_S_SHIFT 4
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index d77a31039f24..f7fd4369b821 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -122,19 +122,17 @@
#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
-#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
-#define VMX_MISC_SAVE_EFER_LMA 0x00000020
-#define VMX_MISC_ACTIVITY_HLT 0x00000040
-#define VMX_MISC_ACTIVITY_WAIT_SIPI 0x00000100
-#define VMX_MISC_ZERO_LEN_INS 0x40000000
-#define VMX_MISC_MSR_LIST_MULTIPLIER 512
-
/* VMFUNC functions */
#define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28)
#define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING)
#define VMFUNC_EPTP_ENTRIES 512
+#define VMX_BASIC_32BIT_PHYS_ADDR_ONLY BIT_ULL(48)
+#define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49)
+#define VMX_BASIC_INOUT BIT_ULL(54)
+#define VMX_BASIC_TRUE_CTLS BIT_ULL(55)
+
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
{
return vmx_basic & GENMASK_ULL(30, 0);
@@ -145,9 +143,30 @@ static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
}
+static inline u32 vmx_basic_vmcs_mem_type(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(53, 50)) >> 50;
+}
+
+static inline u64 vmx_basic_encode_vmcs_info(u32 revision, u16 size, u8 memtype)
+{
+ return revision | ((u64)size << 32) | ((u64)memtype << 50);
+}
+
+#define VMX_MISC_SAVE_EFER_LMA BIT_ULL(5)
+#define VMX_MISC_ACTIVITY_HLT BIT_ULL(6)
+#define VMX_MISC_ACTIVITY_SHUTDOWN BIT_ULL(7)
+#define VMX_MISC_ACTIVITY_WAIT_SIPI BIT_ULL(8)
+#define VMX_MISC_INTEL_PT BIT_ULL(14)
+#define VMX_MISC_RDMSR_IN_SMM BIT_ULL(15)
+#define VMX_MISC_VMXOFF_BLOCK_SMI BIT_ULL(28)
+#define VMX_MISC_VMWRITE_SHADOW_RO_FIELDS BIT_ULL(29)
+#define VMX_MISC_ZERO_LEN_INS BIT_ULL(30)
+#define VMX_MISC_MSR_LIST_MULTIPLIER 512
+
static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
{
- return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ return vmx_misc & GENMASK_ULL(4, 0);
}
static inline int vmx_misc_cr3_count(u64 vmx_misc)
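The encode helper is the inverse of the three decode helpers for the VMCS-info fields; feature bits such as VMX_BASIC_TRUE_CTLS are ORed in on top. A sketch of how a (nested) hypervisor might build the value it advertises, assuming nVMX's VMCS12_REVISION/VMCS12_SIZE constants:

    /* Sketch: advertise a write-back VMCS with the nVMX revision and size. */
    u64 basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
                                           X86_MEMTYPE_WB);
    basic |= VMX_BASIC_TRUE_CTLS | VMX_BASIC_INOUT;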
@@ -508,9 +527,10 @@ enum vmcs_field {
#define VMX_EPTP_PWL_4 0x18ull
#define VMX_EPTP_PWL_5 0x20ull
#define VMX_EPTP_AD_ENABLE_BIT (1ull << 6)
+/* The EPTP memtype is encoded in bits 2:0, i.e. doesn't need to be shifted. */
#define VMX_EPTP_MT_MASK 0x7ull
-#define VMX_EPTP_MT_WB 0x6ull
-#define VMX_EPTP_MT_UC 0x0ull
+#define VMX_EPTP_MT_WB X86_MEMTYPE_WB
+#define VMX_EPTP_MT_UC X86_MEMTYPE_UC
#define VMX_EPT_READABLE_MASK 0x1ull
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index bf57a824f722..a8debbf2f702 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -439,6 +439,7 @@ struct kvm_sync_regs {
#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6)
+#define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
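Like the other entries in KVM_X86_VALID_QUIRKS, the new quirk is opt-out; a VMM that wants precise per-slot zapping can turn it off via KVM_CAP_DISABLE_QUIRKS2 (see the kvm_arch_flush_shadow_memslot() hunk below). A userspace sketch, where vm_fd is assumed to be an open VM file descriptor:

    struct kvm_enable_cap cap = {
            .cap = KVM_CAP_DISABLE_QUIRKS2,
            .args[0] = KVM_X86_QUIRK_SLOT_ZAP_ALL,
    };

    /* Deleting a memslot will now zap only that slot's leaf SPTEs. */
    ioctl(vm_fd, KVM_ENABLE_CAP, &cap);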
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 2a2fc14955cd..989d368be04f 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -55,6 +55,12 @@
#include "mtrr.h"
+static_assert(X86_MEMTYPE_UC == MTRR_TYPE_UNCACHABLE);
+static_assert(X86_MEMTYPE_WC == MTRR_TYPE_WRCOMB);
+static_assert(X86_MEMTYPE_WT == MTRR_TYPE_WRTHROUGH);
+static_assert(X86_MEMTYPE_WP == MTRR_TYPE_WRPROT);
+static_assert(X86_MEMTYPE_WB == MTRR_TYPE_WRBACK);
+
/* arch_phys_wc_add returns an MTRR register index plus this offset. */
#define MTRR_TO_PHYS_WC_OFFSET 1000
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 2617be544480..41786b834b16 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -705,7 +705,7 @@ void kvm_set_cpu_caps(void)
kvm_cpu_cap_init_kvm_defined(CPUID_7_1_EDX,
F(AVX_VNNI_INT8) | F(AVX_NE_CONVERT) | F(PREFETCHITI) |
- F(AMX_COMPLEX)
+ F(AMX_COMPLEX) | F(AVX10)
);
kvm_cpu_cap_init_kvm_defined(CPUID_7_2_EDX,
@@ -721,6 +721,10 @@ void kvm_set_cpu_caps(void)
SF(SGX1) | SF(SGX2) | SF(SGX_EDECCSSA)
);
+ kvm_cpu_cap_init_kvm_defined(CPUID_24_0_EBX,
+ F(AVX10_128) | F(AVX10_256) | F(AVX10_512)
+ );
+
kvm_cpu_cap_mask(CPUID_8000_0001_ECX,
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
@@ -949,7 +953,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
switch (function) {
case 0:
/* Limited to the highest leaf implemented in KVM. */
- entry->eax = min(entry->eax, 0x1fU);
+ entry->eax = min(entry->eax, 0x24U);
break;
case 1:
cpuid_entry_override(entry, CPUID_1_EDX);
@@ -1174,6 +1178,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
break;
}
break;
+ case 0x24: {
+ u8 avx10_version;
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_AVX10)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
+ /*
+ * The AVX10 version is encoded in EBX[7:0]. Note, the version
+ * is guaranteed to be >=1 if AVX10 is supported. Note #2, the
+ * version needs to be captured before overriding EBX features!
+ */
+ avx10_version = min_t(u8, entry->ebx & 0xff, 1);
+ cpuid_entry_override(entry, CPUID_24_0_EBX);
+ entry->ebx |= avx10_version;
+
+ entry->eax = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
+ }
case KVM_CPUID_SIGNATURE: {
const u32 *sigptr = (const u32 *)KVM_SIGNATURE;
entry->eax = KVM_CPUID_FEATURES;
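From the guest's perspective, the new leaf reports the AVX10 version in EBX[7:0] and the supported vector lengths in EBX[18:16]. A guest-side probe might look like the sketch below (real code should first check CPUID.7.1:EDX[19] and that the maximum leaf is >= 0x24):

    #include <cpuid.h>
    #include <stdio.h>

    unsigned int eax, ebx, ecx, edx;

    __cpuid_count(0x24, 0, eax, ebx, ecx, edx);
    printf("AVX10 version %u, 128/256/512-bit: %u/%u/%u\n", ebx & 0xff,
           (ebx >> 16) & 1, (ebx >> 17) & 1, (ebx >> 18) & 1);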
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 3d7eb11d0e45..63f66c51975a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 * Read pending interrupt (from a non-APIC source)
* vector and intack.
*/
-static int kvm_cpu_get_extint(struct kvm_vcpu *v)
+int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
if (!kvm_cpu_has_extint(v)) {
WARN_ON(!lapic_in_kernel(v));
@@ -131,6 +131,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
} else
return kvm_pic_read_irq(v->kvm); /* PIC */
}
+EXPORT_SYMBOL_GPL(kvm_cpu_get_extint);
/*
* Read pending interrupt vector and intack.
@@ -141,9 +142,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
if (vector != -1)
return vector; /* PIC */
- return kvm_get_apic_interrupt(v); /* APIC */
+ vector = kvm_apic_has_interrupt(v); /* APIC */
+ if (vector != -1)
+ kvm_apic_ack_interrupt(v, vector);
+
+ return vector;
}
-EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 5bb481aefcbc..2098dc689088 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1944,7 +1944,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
u64 ns = 0;
ktime_t expire;
struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ u32 this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
ktime_t now;
@@ -2453,6 +2453,43 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+#define X2APIC_ICR_RESERVED_BITS (GENMASK_ULL(31, 20) | GENMASK_ULL(17, 16) | BIT(13))
+
+int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
+{
+ if (data & X2APIC_ICR_RESERVED_BITS)
+ return 1;
+
+ /*
+ * The BUSY bit is reserved on both Intel and AMD in x2APIC mode, but
+ * only AMD requires it to be zero; Intel essentially just ignores the
+ * bit. And if IPI virtualization (Intel) or x2AVIC (AMD) is enabled,
+ * the CPU performs the reserved bits checks, i.e. the underlying CPU
+ * behavior will "win". Arbitrarily clear the BUSY bit, as there is no
+ * sane way to provide consistent behavior with respect to hardware.
+ */
+ data &= ~APIC_ICR_BUSY;
+
+ kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
+ if (kvm_x86_ops.x2apic_icr_is_split) {
+ kvm_lapic_set_reg(apic, APIC_ICR, data);
+ kvm_lapic_set_reg(apic, APIC_ICR2, data >> 32);
+ } else {
+ kvm_lapic_set_reg64(apic, APIC_ICR, data);
+ }
+ trace_kvm_apic_write(APIC_ICR, data);
+ return 0;
+}
+
+static u64 kvm_x2apic_icr_read(struct kvm_lapic *apic)
+{
+ if (kvm_x86_ops.x2apic_icr_is_split)
+ return (u64)kvm_lapic_get_reg(apic, APIC_ICR) |
+ (u64)kvm_lapic_get_reg(apic, APIC_ICR2) << 32;
+
+ return kvm_lapic_get_reg64(apic, APIC_ICR);
+}
+
/* emulate APIC access in a trap manner */
void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
{
@@ -2470,7 +2507,7 @@ void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset)
 * maybe-unnecessary write, and both are in the noise anyways.
*/
if (apic_x2apic_mode(apic) && offset == APIC_ICR)
- kvm_x2apic_icr_write(apic, kvm_lapic_get_reg64(apic, APIC_ICR));
+ WARN_ON_ONCE(kvm_x2apic_icr_write(apic, kvm_x2apic_icr_read(apic)));
else
kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
}
@@ -2922,14 +2959,13 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
}
}
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector)
{
- int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (vector == -1)
- return -1;
+ if (WARN_ON_ONCE(vector < 0 || !apic))
+ return;
/*
* We get here even with APIC virtualization enabled, if doing
@@ -2957,8 +2993,8 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
__apic_update_ppr(apic, &ppr);
}
- return vector;
}
+EXPORT_SYMBOL_GPL(kvm_apic_ack_interrupt);
static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s, bool set)
@@ -2990,18 +3026,22 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
/*
* In x2APIC mode, the LDR is fixed and based on the id. And
- * ICR is internally a single 64-bit register, but needs to be
- * split to ICR+ICR2 in userspace for backwards compatibility.
+ * if the ICR is _not_ split, ICR is internally a single 64-bit
+ * register, but needs to be split to ICR+ICR2 in userspace for
+ * backwards compatibility.
*/
- if (set) {
+ if (set)
*ldr = kvm_apic_calc_x2apic_ldr(x2apic_id);
- icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
- (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
- __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
- } else {
- icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
- __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ if (!kvm_x86_ops.x2apic_icr_is_split) {
+ if (set) {
+ icr = __kvm_lapic_get_reg(s->regs, APIC_ICR) |
+ (u64)__kvm_lapic_get_reg(s->regs, APIC_ICR2) << 32;
+ __kvm_lapic_set_reg64(s->regs, APIC_ICR, icr);
+ } else {
+ icr = __kvm_lapic_get_reg64(s->regs, APIC_ICR);
+ __kvm_lapic_set_reg(s->regs, APIC_ICR2, icr >> 32);
+ }
}
}
@@ -3194,22 +3234,12 @@ int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
return 0;
}
-int kvm_x2apic_icr_write(struct kvm_lapic *apic, u64 data)
-{
- data &= ~APIC_ICR_BUSY;
-
- kvm_apic_send_ipi(apic, (u32)data, (u32)(data >> 32));
- kvm_lapic_set_reg64(apic, APIC_ICR, data);
- trace_kvm_apic_write(APIC_ICR, data);
- return 0;
-}
-
static int kvm_lapic_msr_read(struct kvm_lapic *apic, u32 reg, u64 *data)
{
u32 low;
if (reg == APIC_ICR) {
- *data = kvm_lapic_get_reg64(apic, APIC_ICR);
+ *data = kvm_x2apic_icr_read(apic);
return 0;
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 7ef8ae73e82d..1b8ef9856422 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -88,15 +88,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu);
void kvm_free_lapic(struct kvm_vcpu *vcpu);
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_ack_interrupt(struct kvm_vcpu *vcpu, int vector);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_events(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
-u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_recalculate_apic_map(struct kvm *kvm);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_after_set_mcg_cap(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4341e0e28571..9dc5dd43ae7f 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -223,8 +223,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
bool kvm_mmu_may_ignore_guest_pat(void);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
-
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7813d28b082f..e52f990548df 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -614,32 +614,6 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
-/* Returns the Accessed status of the PTE and resets it at the same time. */
-static bool mmu_spte_age(u64 *sptep)
-{
- u64 spte = mmu_spte_get_lockless(sptep);
-
- if (!is_accessed_spte(spte))
- return false;
-
- if (spte_ad_enabled(spte)) {
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- } else {
- /*
- * Capture the dirty status of the page, so that it doesn't get
- * lost when the SPTE is marked for access tracking.
- */
- if (is_writable_pte(spte))
- kvm_set_pfn_dirty(spte_to_pfn(spte));
-
- spte = mark_spte_for_access_track(spte);
- mmu_spte_update_no_track(sptep, spte);
- }
-
- return true;
-}
-
static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
{
return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
@@ -938,6 +912,7 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
* pte_list_desc containing more mappings.
*/
+#define KVM_RMAP_MANY BIT(0)
/*
* Returns the number of pointers in the rmap chain, not counting the new one.
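Giving bit 0 a name makes the tagged-pointer scheme explicit: an rmap head either stores a single SPTE pointer inline or, with KVM_RMAP_MANY set, points to a pte_list_desc chain. A decoding sketch that mirrors rmap_get_first() further down:

    if (!rmap_head->val)
            ;       /* empty: no SPTEs for this gfn+level */
    else if (!(rmap_head->val & KVM_RMAP_MANY))
            sptep = (u64 *)rmap_head->val;  /* one SPTE, stored inline */
    else
            desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);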
@@ -950,16 +925,16 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
if (!rmap_head->val) {
rmap_head->val = (unsigned long)spte;
- } else if (!(rmap_head->val & 1)) {
+ } else if (!(rmap_head->val & KVM_RMAP_MANY)) {
desc = kvm_mmu_memory_cache_alloc(cache);
desc->sptes[0] = (u64 *)rmap_head->val;
desc->sptes[1] = spte;
desc->spte_count = 2;
desc->tail_count = 0;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
++count;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
count = desc->tail_count + desc->spte_count;
/*
@@ -968,10 +943,10 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
*/
if (desc->spte_count == PTE_LIST_EXT) {
desc = kvm_mmu_memory_cache_alloc(cache);
- desc->more = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc->more = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
desc->spte_count = 0;
desc->tail_count = count;
- rmap_head->val = (unsigned long)desc | 1;
+ rmap_head->val = (unsigned long)desc | KVM_RMAP_MANY;
}
desc->sptes[desc->spte_count++] = spte;
}
@@ -982,7 +957,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
struct pte_list_desc *desc, int i)
{
- struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ struct pte_list_desc *head_desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
int j = head_desc->spte_count - 1;
/*
@@ -1011,7 +986,7 @@ static void pte_list_desc_remove_entry(struct kvm *kvm,
if (!head_desc->more)
rmap_head->val = 0;
else
- rmap_head->val = (unsigned long)head_desc->more | 1;
+ rmap_head->val = (unsigned long)head_desc->more | KVM_RMAP_MANY;
mmu_free_pte_list_desc(head_desc);
}
@@ -1024,13 +999,13 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
return;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
return;
rmap_head->val = 0;
} else {
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
while (desc) {
for (i = 0; i < desc->spte_count; ++i) {
if (desc->sptes[i] == spte) {
@@ -1063,12 +1038,12 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
if (!rmap_head->val)
return false;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val);
goto out;
}
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
for (; desc; desc = next) {
for (i = 0; i < desc->spte_count; i++)
@@ -1088,10 +1063,10 @@ unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
if (!rmap_head->val)
return 0;
- else if (!(rmap_head->val & 1))
+ else if (!(rmap_head->val & KVM_RMAP_MANY))
return 1;
- desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
return desc->tail_count + desc->spte_count;
}
@@ -1153,13 +1128,13 @@ static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
if (!rmap_head->val)
return NULL;
- if (!(rmap_head->val & 1)) {
+ if (!(rmap_head->val & KVM_RMAP_MANY)) {
iter->desc = NULL;
sptep = (u64 *)rmap_head->val;
goto out;
}
- iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
+ iter->desc = (struct pte_list_desc *)(rmap_head->val & ~KVM_RMAP_MANY);
iter->pos = 0;
sptep = iter->desc->sptes[iter->pos];
out:
@@ -1307,15 +1282,6 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
return flush;
}
-/**
- * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
- * @kvm: kvm instance
- * @slot: slot to protect
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should protect
- *
- * Used when we do not need to care about huge page mappings.
- */
static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1339,16 +1305,6 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
- * protect the page if the D-bit isn't supported.
- * @kvm: kvm instance
- * @slot: slot to clear D-bit
- * @gfn_offset: start of the BITS_PER_LONG pages we care about
- * @mask: indicates which pages we should clear D-bit
- *
- * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
- */
static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
@@ -1372,24 +1328,16 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
}
}
-/**
- * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
- * PT level pages.
- *
- * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
- * enable dirty logging for them.
- *
- * We need to care about huge page mappings: e.g. during dirty logging we may
- * have such mappings.
- */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask)
{
/*
- * Huge pages are NOT write protected when we start dirty logging in
- * initially-all-set mode; must write protect them here so that they
- * are split to 4K on the first write.
+ * If the slot was assumed to be "initially all dirty", write-protect
+ * huge pages to ensure they are split to 4KiB on the first write (KVM
+ * dirty logs at 4KiB granularity). If eager page splitting is enabled,
+ * immediately try to split huge pages, e.g. so that vCPUs don't get
+ * saddled with the cost of splitting.
*
* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
* of memslot has no such restriction, so the range can cross two large
@@ -1411,7 +1359,16 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
PG_LEVEL_2M);
}
- /* Now handle 4K PTEs. */
+ /*
+ * (Re)Enable dirty logging for all 4KiB SPTEs that map the GFNs in
+ * mask. If PML is enabled and the GFN doesn't need to be write-
+ * protected for other reasons, e.g. shadow paging, clear the Dirty bit.
+ * Otherwise clear the Writable bit.
+ *
+ * Note that kvm_mmu_clear_dirty_pt_masked() is called whenever PML is
+ * enabled, and it chooses between clearing the Dirty bit and the
+ * Writable bit based on the context.
+ */
if (kvm_x86_ops.cpu_dirty_log_size)
kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
else
@@ -1453,16 +1410,10 @@ static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn, PG_LEVEL_4K);
}
-static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot)
-{
- return kvm_zap_all_rmap_sptes(kvm, rmap_head);
-}
-
static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
+ const struct kvm_memory_slot *slot)
{
- return __kvm_zap_rmap(kvm, rmap_head, slot);
+ return kvm_zap_all_rmap_sptes(kvm, rmap_head);
}
struct slot_rmap_walk_iterator {
@@ -1513,7 +1464,7 @@ static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
{
while (++iterator->rmap <= iterator->end_rmap) {
- iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
if (iterator->rmap->val)
return;
@@ -1534,23 +1485,71 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
slot_rmap_walk_okay(_iter_); \
slot_rmap_walk_next(_iter_))
-typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn,
- int level);
+/* The return value indicates whether a TLB flush on all vCPUs is needed. */
+typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ const struct kvm_memory_slot *slot);
-static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
- struct kvm_gfn_range *range,
- rmap_handler_t handler)
+static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn,
+ bool can_yield, bool flush_on_yield,
+ bool flush)
{
struct slot_rmap_walk_iterator iterator;
- bool ret = false;
- for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- range->start, range->end - 1, &iterator)
- ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
- iterator.level);
+ lockdep_assert_held_write(&kvm->mmu_lock);
- return ret;
+ for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap, slot);
+
+ if (!can_yield)
+ continue;
+
+ if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
+ if (flush && flush_on_yield) {
+ kvm_flush_remote_tlbs_range(kvm, start_gfn,
+ iterator.gfn - start_gfn + 1);
+ flush = false;
+ }
+ cond_resched_rwlock_write(&kvm->mmu_lock);
+ }
+ }
+
+ return flush;
+}
+
+static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ int start_level, int end_level,
+ bool flush_on_yield)
+{
+ return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
+ slot->base_gfn, slot->base_gfn + slot->npages - 1,
+ true, flush_on_yield, false);
+}
+
+static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ slot_rmaps_handler fn,
+ bool flush_on_yield)
+{
+ return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
+}
+
+static bool __kvm_rmap_zap_gfn_range(struct kvm *kvm,
+ const struct kvm_memory_slot *slot,
+ gfn_t start, gfn_t end, bool can_yield,
+ bool flush)
+{
+ return __walk_slot_rmaps(kvm, slot, kvm_zap_rmap,
+ PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, can_yield, true, flush);
}
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1558,7 +1557,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool flush = false;
if (kvm_memslots_have_rmaps(kvm))
- flush = kvm_handle_gfn_range(kvm, range, kvm_zap_rmap);
+ flush = __kvm_rmap_zap_gfn_range(kvm, range->slot,
+ range->start, range->end,
+ range->may_block, flush);
if (tdp_mmu_enabled)
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
@@ -1570,31 +1571,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
return flush;
}
-static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
-{
- u64 *sptep;
- struct rmap_iterator iter;
- int young = 0;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- young |= mmu_spte_age(sptep);
-
- return young;
-}
-
-static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
- struct kvm_memory_slot *slot, gfn_t gfn, int level)
-{
- u64 *sptep;
- struct rmap_iterator iter;
-
- for_each_rmap_spte(rmap_head, &iter, sptep)
- if (is_accessed_spte(*sptep))
- return true;
- return false;
-}
-
#define RMAP_RECYCLE_THRESHOLD 1000
static void __rmap_add(struct kvm *kvm,
@@ -1629,12 +1605,52 @@ static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
__rmap_add(vcpu->kvm, cache, slot, spte, gfn, access);
}
+static bool kvm_rmap_age_gfn_range(struct kvm *kvm,
+ struct kvm_gfn_range *range, bool test_only)
+{
+ struct slot_rmap_walk_iterator iterator;
+ struct rmap_iterator iter;
+ bool young = false;
+ u64 *sptep;
+
+ for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
+ range->start, range->end - 1, &iterator) {
+ for_each_rmap_spte(iterator.rmap, &iter, sptep) {
+ u64 spte = *sptep;
+
+ if (!is_accessed_spte(spte))
+ continue;
+
+ if (test_only)
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that
+ * it doesn't get lost when the SPTE is marked
+ * for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+ young = true;
+ }
+ }
+ return young;
+}
+
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, false);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1647,7 +1663,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
bool young = false;
if (kvm_memslots_have_rmaps(kvm))
- young = kvm_handle_gfn_range(kvm, range, kvm_test_age_rmap);
+ young = kvm_rmap_age_gfn_range(kvm, range, true);
if (tdp_mmu_enabled)
young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
@@ -2713,36 +2729,49 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
write_unlock(&kvm->mmu_lock);
}
-int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
+bool __kvm_mmu_unprotect_gfn_and_retry(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ bool always_retry)
{
- struct kvm_mmu_page *sp;
+ struct kvm *kvm = vcpu->kvm;
LIST_HEAD(invalid_list);
- int r;
+ struct kvm_mmu_page *sp;
+ gpa_t gpa = cr2_or_gpa;
+ bool r = false;
+
+ /*
+ * Bail early if there aren't any write-protected shadow pages to avoid
+ * unnecessarily taking mmu_lock, e.g. if the gfn is write-tracked
+ * by a third party. Reading indirect_shadow_pages without holding
+ * mmu_lock is safe, as this is purely an optimization, i.e. a false
+ * positive is benign, and a false negative will simply result in KVM
+ * skipping the unprotect+retry path, which is also an optimization.
+ */
+ if (!READ_ONCE(kvm->arch.indirect_shadow_pages))
+ goto out;
+
+ if (!vcpu->arch.mmu->root_role.direct) {
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
+ if (gpa == INVALID_GPA)
+ goto out;
+ }
- r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
- r = 1;
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gpa_to_gfn(gpa))
kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
- }
+
+ /*
+ * Snapshot the result before zapping, as zapping will remove all list
+ * entries, i.e. checking the list later would yield a false negative.
+ */
+ r = !list_empty(&invalid_list);
kvm_mmu_commit_zap_page(kvm, &invalid_list);
write_unlock(&kvm->mmu_lock);
- return r;
-}
-
-static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
-{
- gpa_t gpa;
- int r;
-
- if (vcpu->arch.mmu->root_role.direct)
- return 0;
-
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
- r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
+out:
+ if (r || always_retry) {
+ vcpu->arch.last_retry_eip = kvm_rip_read(vcpu);
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
+ }
return r;
}
@@ -2914,10 +2943,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
trace_kvm_mmu_set_spte(level, gfn, sptep);
}
- if (wrprot) {
- if (write_fault)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && write_fault)
+ ret = RET_PF_WRITE_PROTECTED;
if (flush)
kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
@@ -4549,7 +4576,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
return RET_PF_RETRY;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4618,8 +4645,6 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
if (!flags) {
trace_kvm_page_fault(vcpu, fault_address, error_code);
- if (kvm_event_needs_reinjection(vcpu))
- kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
@@ -4642,7 +4667,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
int r;
if (page_fault_handle_page_track(vcpu, fault))
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
r = fast_page_fault(vcpu, fault);
if (r != RET_PF_INVALID)
@@ -4719,6 +4744,7 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
switch (r) {
case RET_PF_FIXED:
case RET_PF_SPURIOUS:
+ case RET_PF_WRITE_PROTECTED:
return 0;
case RET_PF_EMULATE:
@@ -5963,6 +5989,106 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
write_unlock(&vcpu->kvm->mmu_lock);
}
+static bool is_write_to_guest_page_table(u64 error_code)
+{
+ const u64 mask = PFERR_GUEST_PAGE_MASK | PFERR_WRITE_MASK | PFERR_PRESENT_MASK;
+
+ return (error_code & mask) == mask;
+}
+
+static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u64 error_code, int *emulation_type)
+{
+ bool direct = vcpu->arch.mmu->root_role.direct;
+
+ /*
+ * Do not try to unprotect and retry if the vCPU re-faulted on the same
+ * RIP with the same address that was previously unprotected, as doing
+ * so will likely put the vCPU into an infinite loop. E.g. if the vCPU uses
+ * a non-page-table modifying instruction on the PDE that points to the
+ * instruction, then unprotecting the gfn will unmap the instruction's
+ * code, i.e. make it impossible for the instruction to ever complete.
+ */
+ if (vcpu->arch.last_retry_eip == kvm_rip_read(vcpu) &&
+ vcpu->arch.last_retry_addr == cr2_or_gpa)
+ return RET_PF_EMULATE;
+
+ /*
+ * Reset the unprotect+retry values that guard against infinite loops.
+ * The values will be refreshed if KVM explicitly unprotects a gfn and
+ * retries, in all other cases it's safe to retry in the future even if
+ * the next page fault happens on the same RIP+address.
+ */
+ vcpu->arch.last_retry_eip = 0;
+ vcpu->arch.last_retry_addr = 0;
+
+ /*
+ * It should be impossible to reach this point with an MMIO cache hit,
+ * as RET_PF_WRITE_PROTECTED is returned if and only if there's a valid,
+ * writable memslot, and creating a memslot should invalidate the MMIO
+ * cache by way of changing the memslot generation. WARN and disallow
+ * retry if MMIO is detected, as retrying MMIO emulation is pointless
+ * and could put the vCPU into an infinite loop because the processor
+ * will keep faulting on the non-existent MMIO address.
+ */
+ if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
+ return RET_PF_EMULATE;
+
+ /*
+ * Before emulating the instruction, check to see if the access was due
+ * to a read-only violation while the CPU was walking non-nested NPT
+ * page tables, i.e. for a direct MMU, for _guest_ page tables in L1.
+ * If L1 is sharing (a subset of) its page tables with L2, e.g. by
+ * having nCR3 share lower level page tables with hCR3, then when KVM
+ * (L0) write-protects the nested NPTs, i.e. npt12 entries, KVM is also
+ * unknowingly write-protecting L1's guest page tables, which KVM isn't
+ * shadowing.
+ *
+ * Because the CPU (by default) walks NPT page tables using a write
+ * access (to ensure the CPU can do A/D updates), page walks in L1 can
+ * trigger write faults for the above case even when L1 isn't modifying
+ * PTEs. As a result, KVM will unnecessarily emulate (or at least, try
+ * to emulate) an excessive number of L1 instructions; because L1's MMU
+ * isn't shadowed by KVM, there is no need to write-protect L1's gPTEs
+ * and thus no need to emulate in order to guarantee forward progress.
+ *
+ * Try to unprotect the gfn, i.e. zap any shadow pages, so that L1 can
+ * proceed without triggering emulation. If one or more shadow pages
+ * was zapped, skip emulation and resume L1 to let it natively execute
+ * the instruction. If no shadow pages were zapped, then the write-
+ * fault is due to something else entirely, i.e. KVM needs to emulate,
+ * as resuming the guest will put it into an infinite loop.
+ *
+ * Note, this code also applies to Intel CPUs, even though it is *very*
+ * unlikely that an L1 will share its page tables (IA32/PAE/paging64
+ * format) with L2's page tables (EPT format).
+ *
+ * For indirect MMUs, i.e. if KVM is shadowing the current MMU, try to
+ * unprotect the gfn and retry if an event is awaiting reinjection. If
+ * KVM emulates multiple instructions before completing event injection,
+ * the event could be delayed beyond what is architecturally allowed,
+ * e.g. KVM could inject an IRQ after the TPR has been raised.
+ */
+ if (((direct && is_write_to_guest_page_table(error_code)) ||
+ (!direct && kvm_event_needs_reinjection(vcpu))) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
+ return RET_PF_RETRY;
+
+ /*
+ * The gfn is write-protected, but if KVM detects it's emulating an
+ * instruction that is unlikely to be used to modify page tables, or if
+ * emulation fails, KVM can try to unprotect the gfn and let the CPU
+ * re-execute the instruction that caused the page fault. Do not allow
+ * retrying an instruction from a nested guest as KVM is only explicitly
+ * shadowing L1's page tables, i.e. unprotecting something for L1 isn't
+ * going to magically fix whatever issue caused L2 to fail.
+ */
+ if (!is_guest_mode(vcpu))
+ *emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
+
+ return RET_PF_EMULATE;
+}
+
int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
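Note that is_write_to_guest_page_table() requires all three error-code bits at once, not any one of them; a quick illustration using KVM's PFERR_* masks (sketch, not part of the patch):

    /* Illustration: present + write + guest-page-table must all be set. */
    u64 ec = PFERR_PRESENT_MASK | PFERR_WRITE_MASK | PFERR_GUEST_PAGE_MASK;

    WARN_ON(!is_write_to_guest_page_table(ec));
    WARN_ON(is_write_to_guest_page_table(ec & ~PFERR_WRITE_MASK));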
@@ -6008,6 +6134,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r < 0)
return r;
+ if (r == RET_PF_WRITE_PROTECTED)
+ r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
+ &emulation_type);
+
if (r == RET_PF_FIXED)
vcpu->stat.pf_fixed++;
else if (r == RET_PF_EMULATE)
@@ -6018,32 +6148,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
if (r != RET_PF_EMULATE)
return 1;
- /*
- * Before emulating the instruction, check if the error code
- * was due to a RO violation while translating the guest page.
- * This can occur when using nested virtualization with nested
- * paging in both guests. If true, we simply unprotect the page
- * and resume the guest.
- */
- if (vcpu->arch.mmu->root_role.direct &&
- (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
- return 1;
- }
-
- /*
- * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
- * optimistically try to just unprotect the page and let the processor
- * re-execute the instruction that caused the page fault. Do not allow
- * retrying MMIO emulation, as it's not only pointless but could also
- * cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address. Retrying an instruction
- * from a nested guest is also pointless and dangerous as we are only
- * explicitly shadowing L1's page tables, i.e. unprotecting something
- * for L1 isn't going to magically fix whatever issue cause L2 to fail.
- */
- if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
- emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
emulate:
return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
@@ -6202,59 +6306,6 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
}
EXPORT_SYMBOL_GPL(kvm_configure_mmu);
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_rmaps_handler) (struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- const struct kvm_memory_slot *slot);
-
-static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn,
- bool flush_on_yield, bool flush)
-{
- struct slot_rmap_walk_iterator iterator;
-
- lockdep_assert_held_write(&kvm->mmu_lock);
-
- for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap, slot);
-
- if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
- if (flush && flush_on_yield) {
- kvm_flush_remote_tlbs_range(kvm, start_gfn,
- iterator.gfn - start_gfn + 1);
- flush = false;
- }
- cond_resched_rwlock_write(&kvm->mmu_lock);
- }
- }
-
- return flush;
-}
-
-static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- int start_level, int end_level,
- bool flush_on_yield)
-{
- return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
- slot->base_gfn, slot->base_gfn + slot->npages - 1,
- flush_on_yield, false);
-}
-
-static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
- const struct kvm_memory_slot *slot,
- slot_rmaps_handler fn,
- bool flush_on_yield)
-{
- return walk_slot_rmaps(kvm, slot, fn, PG_LEVEL_4K, PG_LEVEL_4K, flush_on_yield);
-}
-
static void free_mmu_pages(struct kvm_mmu *mmu)
{
if (!tdp_enabled && mmu->pae_root)
@@ -6528,9 +6579,8 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e
if (WARN_ON_ONCE(start >= end))
continue;
- flush = __walk_slot_rmaps(kvm, memslot, __kvm_zap_rmap,
- PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true, flush);
+ flush = __kvm_rmap_zap_gfn_range(kvm, memslot, start,
+ end, true, flush);
}
}
@@ -6818,7 +6868,7 @@ static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
*/
for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
__walk_slot_rmaps(kvm, slot, shadow_mmu_try_split_huge_pages,
- level, level, start, end - 1, true, false);
+ level, level, start, end - 1, true, true, false);
}
/* Must be called with the mmu_lock held in write-mode. */
@@ -6997,10 +7047,42 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
kvm_mmu_zap_all(kvm);
}
+/*
+ * Zap leaf SPTEs in the memslot's range when a memslot is moved/deleted.
+ *
+ * Zapping non-leaf SPTEs, a.k.a. not-last SPTEs, isn't required; worst
+ * case scenario, unused shadow pages are left lying around until they
+ * are recycled due to age or when the VM is destroyed.
+ */
+static void kvm_mmu_zap_memslot_leafs(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_gfn_range range = {
+ .slot = slot,
+ .start = slot->base_gfn,
+ .end = slot->base_gfn + slot->npages,
+ .may_block = true,
+ };
+
+ write_lock(&kvm->mmu_lock);
+ if (kvm_unmap_gfn_range(kvm, &range))
+ kvm_flush_remote_tlbs_memslot(kvm, slot);
+
+ write_unlock(&kvm->mmu_lock);
+}
+
+static inline bool kvm_memslot_flush_zap_all(struct kvm *kvm)
+{
+ return kvm->arch.vm_type == KVM_X86_DEFAULT_VM &&
+ kvm_check_has_quirk(kvm, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+}
+
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_zap_all_fast(kvm);
+ if (kvm_memslot_flush_zap_all(kvm))
+ kvm_mmu_zap_all_fast(kvm);
+ else
+ kvm_mmu_zap_memslot_leafs(kvm, slot);
}
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1721d97743e9..c98827840e07 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
* RET_PF_CONTINUE: So far, so good, keep handling the page fault.
* RET_PF_RETRY: let CPU fault again on the address.
* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotect the
+ * gfn and retry, or emulate the instruction directly.
* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
* RET_PF_FIXED: The faulting entry has been fixed.
* RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
@@ -274,6 +276,7 @@ enum {
RET_PF_CONTINUE = 0,
RET_PF_RETRY,
RET_PF_EMULATE,
+ RET_PF_WRITE_PROTECTED,
RET_PF_INVALID,
RET_PF_FIXED,
RET_PF_SPURIOUS,
@@ -349,8 +352,6 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
-void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
-
void track_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void untrack_possible_nx_huge_page(struct kvm *kvm, struct kvm_mmu_page *sp);
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 195d98bc8de8..f35a830ce469 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -57,6 +57,7 @@
TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
TRACE_DEFINE_ENUM(RET_PF_RETRY);
TRACE_DEFINE_ENUM(RET_PF_EMULATE);
+TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
TRACE_DEFINE_ENUM(RET_PF_INVALID);
TRACE_DEFINE_ENUM(RET_PF_FIXED);
TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 69941cebb3a8..ae7d39ff2d07 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -646,10 +646,10 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
* really care if it changes underneath us after this point).
*/
if (FNAME(gpte_changed)(vcpu, gw, top_level))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
/*
* Load a new root and retry the faulting instruction in the extremely
@@ -659,7 +659,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
*/
if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) {
kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu);
- goto out_gpte_changed;
+ return RET_PF_RETRY;
}
for_each_shadow_entry(vcpu, fault->addr, it) {
@@ -674,34 +674,38 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
false, access);
- if (sp != ERR_PTR(-EEXIST)) {
- /*
- * We must synchronize the pagetable before linking it
- * because the guest doesn't need to flush tlb when
- * the gpte is changed from non-present to present.
- * Otherwise, the guest may use the wrong mapping.
- *
- * For PG_LEVEL_4K, kvm_mmu_get_page() has already
- * synchronized it transiently via kvm_sync_page().
- *
- * For higher level pagetable, we synchronize it via
- * the slower mmu_sync_children(). If it needs to
- * break, some progress has been made; return
- * RET_PF_RETRY and retry on the next #PF.
- * KVM_REQ_MMU_SYNC is not necessary but it
- * expedites the process.
- */
- if (sp->unsync_children &&
- mmu_sync_children(vcpu, sp, false))
- return RET_PF_RETRY;
- }
+ /*
+ * Synchronize the new page before linking it, as the CPU (KVM)
+ * is architecturally disallowed from inserting non-present
+ * entries into the TLB, i.e. the guest isn't required to flush
+ * the TLB when changing the gPTE from non-present to present.
+ *
+ * For PG_LEVEL_4K, kvm_mmu_find_shadow_page() has already
+ * synchronized the page via kvm_sync_page().
+ *
+ * For higher level pages, which cannot themselves be unsync
+ * but can have unsync children, synchronize via the slower
+ * mmu_sync_children(). If KVM needs to drop mmu_lock due to
+ * contention or to reschedule, instruct the caller to retry
+ * the #PF (mmu_sync_children() ensures forward progress will
+ * be made).
+ */
+ if (sp != ERR_PTR(-EEXIST) && sp->unsync_children &&
+ mmu_sync_children(vcpu, sp, false))
+ return RET_PF_RETRY;
/*
- * Verify that the gpte in the page we've just write
- * protected is still there.
+ * Verify that the gpte in the page, which is now either
+ * write-protected or unsync, wasn't modified between the fault
+ * and acquiring mmu_lock. This needs to be done even when
+ * reusing an existing shadow page to ensure the information
+ * gathered by the walker matches the information stored in the
+ * shadow page (which could have been modified by a different
+ * vCPU even if the page was already linked). Holding mmu_lock
+ * prevents the shadow page from changing after this point.
*/
if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
- goto out_gpte_changed;
+ return RET_PF_RETRY;
if (sp != ERR_PTR(-EEXIST))
link_shadow_page(vcpu, it.sptep, sp);
@@ -755,9 +759,6 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return ret;
-
-out_gpte_changed:
- return RET_PF_RETRY;
}
/*
@@ -805,7 +806,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
if (page_fault_handle_page_track(vcpu, fault)) {
shadow_page_table_clear_flood(vcpu, fault->addr);
- return RET_PF_EMULATE;
+ return RET_PF_WRITE_PROTECTED;
}
r = mmu_topup_memory_caches(vcpu, true);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 3c55955bcaf8..3b996c1fdaab 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
* protected, emulation is needed. If the emulation was skipped,
* the vCPU would have the same fault again.
*/
- if (wrprot) {
- if (fault->write)
- ret = RET_PF_EMULATE;
- }
+ if (wrprot && fault->write)
+ ret = RET_PF_WRITE_PROTECTED;
/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h
index 2f4e155080ba..0d17d6b70639 100644
--- a/arch/x86/kvm/reverse_cpuid.h
+++ b/arch/x86/kvm/reverse_cpuid.h
@@ -17,6 +17,7 @@ enum kvm_only_cpuid_leafs {
CPUID_8000_0007_EDX,
CPUID_8000_0022_EAX,
CPUID_7_2_EDX,
+ CPUID_24_0_EBX,
NR_KVM_CPU_CAPS,
NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS,
@@ -46,6 +47,7 @@ enum kvm_only_cpuid_leafs {
#define X86_FEATURE_AVX_NE_CONVERT KVM_X86_FEATURE(CPUID_7_1_EDX, 5)
#define X86_FEATURE_AMX_COMPLEX KVM_X86_FEATURE(CPUID_7_1_EDX, 8)
#define X86_FEATURE_PREFETCHITI KVM_X86_FEATURE(CPUID_7_1_EDX, 14)
+#define X86_FEATURE_AVX10 KVM_X86_FEATURE(CPUID_7_1_EDX, 19)
/* Intel-defined sub-features, CPUID level 0x00000007:2 (EDX) */
#define X86_FEATURE_INTEL_PSFD KVM_X86_FEATURE(CPUID_7_2_EDX, 0)
@@ -55,6 +57,11 @@ enum kvm_only_cpuid_leafs {
#define KVM_X86_FEATURE_BHI_CTRL KVM_X86_FEATURE(CPUID_7_2_EDX, 4)
#define X86_FEATURE_MCDT_NO KVM_X86_FEATURE(CPUID_7_2_EDX, 5)
+/* Intel-defined sub-features, CPUID level 0x00000024:0 (EBX) */
+#define X86_FEATURE_AVX10_128 KVM_X86_FEATURE(CPUID_24_0_EBX, 16)
+#define X86_FEATURE_AVX10_256 KVM_X86_FEATURE(CPUID_24_0_EBX, 17)
+#define X86_FEATURE_AVX10_512 KVM_X86_FEATURE(CPUID_24_0_EBX, 18)
+
/* CPUID level 0x80000007 (EDX). */
#define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8)
@@ -90,6 +97,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX},
[CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX},
[CPUID_7_2_EDX] = { 7, 2, CPUID_EDX},
+ [CPUID_24_0_EBX] = { 0x24, 0, CPUID_EBX},
};
/*
diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c
index 00e3c27d2a87..85241c0c7f56 100644
--- a/arch/x86/kvm/smm.c
+++ b/arch/x86/kvm/smm.c
@@ -624,17 +624,31 @@ int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
#endif
/*
- * Give leave_smm() a chance to make ISA-specific changes to the vCPU
- * state (e.g. enter guest mode) before loading state from the SMM
- * state-save area.
+ * FIXME: When resuming L2 (a.k.a. guest mode), the transition to guest
+ * mode should happen _after_ loading state from SMRAM. However, KVM
+ * piggybacks the nested VM-Enter flows (which is wrong for many other
+ * reasons), and so nSVM/nVMX would clobber state that is loaded from
+ * SMRAM and from the VMCS/VMCB.
*/
if (kvm_x86_call(leave_smm)(vcpu, &smram))
return X86EMUL_UNHANDLEABLE;
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- return rsm_load_state_64(ctxt, &smram.smram64);
+ ret = rsm_load_state_64(ctxt, &smram.smram64);
else
#endif
- return rsm_load_state_32(ctxt, &smram.smram32);
+ ret = rsm_load_state_32(ctxt, &smram.smram32);
+
+ /*
+ * If RSM fails and triggers shutdown, architecturally the shutdown
+ * occurs *before* the transition to guest mode. But due to KVM's
+ * flawed handling of RSM to L2 (see above), the vCPU may already be
+ * in_guest_mode(). Force the vCPU out of guest mode before delivering
+ * the shutdown, so that L1 enters shutdown instead of seeing a VM-Exit
+ * that architecturally shouldn't be possible.
+ */
+ if (ret != X86EMUL_CONTINUE && is_guest_mode(vcpu))
+ kvm_leave_nested(vcpu);
+ return ret;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 6f704c1037e5..d5314cb7dff4 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1693,8 +1693,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
- save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ save = kzalloc(sizeof(*save), GFP_KERNEL);
if (!ctl || !save)
goto out_free;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 5ab2c92c7331..9df3e1e5ae81 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -573,7 +573,7 @@ static void __svm_write_tsc_multiplier(u64 multiplier)
static __always_inline struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd)
{
- return page_address(sd->save_area) + 0x400;
+ return &sd->save_area->host_sev_es_save;
}
static inline void kvm_cpu_svm_disable(void)
@@ -592,14 +592,14 @@ static inline void kvm_cpu_svm_disable(void)
}
}
-static void svm_emergency_disable(void)
+static void svm_emergency_disable_virtualization_cpu(void)
{
kvm_rebooting = true;
kvm_cpu_svm_disable();
}
-static void svm_hardware_disable(void)
+static void svm_disable_virtualization_cpu(void)
{
/* Make sure we clean up behind us */
if (tsc_scaling)
@@ -610,7 +610,7 @@ static void svm_hardware_disable(void)
amd_pmu_disable_virt();
}
-static int svm_hardware_enable(void)
+static int svm_enable_virtualization_cpu(void)
{
struct svm_cpu_data *sd;
@@ -696,7 +696,7 @@ static void svm_cpu_uninit(int cpu)
return;
kfree(sd->sev_vmcbs);
- __free_page(sd->save_area);
+ __free_page(__sme_pa_to_page(sd->save_area_pa));
sd->save_area_pa = 0;
sd->save_area = NULL;
}
@@ -704,23 +704,24 @@ static void svm_cpu_uninit(int cpu)
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
+ struct page *save_area_page;
int ret = -ENOMEM;
memset(sd, 0, sizeof(struct svm_cpu_data));
- sd->save_area = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
- if (!sd->save_area)
+ save_area_page = snp_safe_alloc_page_node(cpu_to_node(cpu), GFP_KERNEL);
+ if (!save_area_page)
return ret;
ret = sev_cpu_init(sd);
if (ret)
goto free_save_area;
- sd->save_area_pa = __sme_page_pa(sd->save_area);
+ sd->save_area = page_address(save_area_page);
+ sd->save_area_pa = __sme_page_pa(save_area_page);
return 0;
free_save_area:
- __free_page(sd->save_area);
- sd->save_area = NULL;
+ __free_page(save_area_page);
return ret;
}
@@ -1124,8 +1125,7 @@ static void svm_hardware_unsetup(void)
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
- get_order(IOPM_SIZE));
+ __free_pages(__sme_pa_to_page(iopm_base), get_order(IOPM_SIZE));
iopm_base = 0;
}
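These conversions lean on a helper pair in svm.h (changed elsewhere in this series and not shown in this excerpt). The presumed shape, stated here only as an assumption for readability:

    /*
     * Presumed definitions (see svm.h in this series): round-trip a page
     * through its SME-tagged physical address.
     */
    #define __sme_page_pa(pg)     __sme_set(page_to_pfn(pg) << PAGE_SHIFT)
    #define __sme_pa_to_page(pa)  pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT)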
@@ -1301,7 +1301,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
if (!kvm_hlt_in_guest(vcpu->kvm))
svm_set_intercept(svm, INTERCEPT_HLT);
- control->iopm_base_pa = __sme_set(iopm_base);
+ control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
@@ -1503,7 +1503,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu)
sev_free_vcpu(vcpu);
- __free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
+ __free_page(__sme_pa_to_page(svm->vmcb01.pa));
__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
@@ -1533,7 +1533,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
* TSC_AUX is always virtualized for SEV-ES guests when the feature is
* available. The user return MSR support is not required in this case
* because TSC_AUX is restored on #VMEXIT from the host save area
- * (which has been initialized in svm_hardware_enable()).
+ * (which has been initialized in svm_enable_virtualization_cpu()).
*/
if (likely(tsc_aux_uret_slot >= 0) &&
(!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm)))
@@ -2825,17 +2825,17 @@ static int efer_trap(struct kvm_vcpu *vcpu)
return kvm_complete_insn_gp(vcpu, ret);
}
-static int svm_get_msr_feature(struct kvm_msr_entry *msr)
+static int svm_get_feature_msr(u32 msr, u64 *data)
{
- msr->data = 0;
+ *data = 0;
- switch (msr->index) {
+ switch (msr) {
case MSR_AMD64_DE_CFG:
if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
+ *data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
@@ -3144,7 +3144,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
* feature is available. The user return MSR support is not
* required in this case because TSC_AUX is restored on #VMEXIT
* from the host save area (which has been initialized in
- * svm_hardware_enable()).
+ * svm_enable_virtualization_cpu()).
*/
if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
break;
@@ -3191,18 +3191,21 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
break;
case MSR_AMD64_DE_CFG: {
- struct kvm_msr_entry msr_entry;
+ u64 supported_de_cfg;
- msr_entry.index = msr->index;
- if (svm_get_msr_feature(&msr_entry))
+ if (svm_get_feature_msr(ecx, &supported_de_cfg))
return 1;
- /* Check the supported bits */
- if (data & ~msr_entry.data)
+ if (data & ~supported_de_cfg)
return 1;
- /* Don't allow the guest to change a bit, #GP */
- if (!msr->host_initiated && (data ^ msr_entry.data))
+ /*
+ * Don't let the guest change the host-programmed value. The
+ * MSR is very model specific, i.e. contains multiple bits that
+ * are completely unknown to KVM, and the one bit known to KVM
+ * is simply a reflection of hardware capabilities.
+ */
+ if (!msr->host_initiated && data != svm->msr_decfg)
return 1;
svm->msr_decfg = data;
@@ -4156,12 +4159,21 @@ static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
- if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
- to_svm(vcpu)->vmcb->control.exit_info_1)
+ switch (svm->vmcb->control.exit_code) {
+ case SVM_EXIT_MSR:
+ if (!svm->vmcb->control.exit_info_1)
+ break;
return handle_fastpath_set_msr_irqoff(vcpu);
+ case SVM_EXIT_HLT:
+ return handle_fastpath_hlt(vcpu);
+ default:
+ break;
+ }
return EXIT_FASTPATH_NONE;
}
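
Both vendor fastpaths now route HLT exits to a common handle_fastpath_hlt() whose body is outside this diff. A user-space model of the decision it plausibly encodes, inferred from the __kvm_emulate_halt() logic removed from x86.c later in this patch (the enum and helper names here are illustrative only):

	#include <assert.h>

	enum fastpath { FASTPATH_NONE, FASTPATH_REENTER_GUEST,
			FASTPATH_EXIT_HANDLED, FASTPATH_EXIT_USERSPACE };

	/*
	 * in_kernel_apic: halt state is tracked in-kernel; event_pending: a
	 * wake event arrived between HLT and the exit being processed.
	 */
	static enum fastpath fastpath_hlt(int in_kernel_apic, int event_pending)
	{
		if (!in_kernel_apic)	/* userspace must handle wake events */
			return FASTPATH_EXIT_USERSPACE;
		if (event_pending)	/* don't halt, immediately resume */
			return FASTPATH_REENTER_GUEST;
		return FASTPATH_EXIT_HANDLED;
	}

	int main(void)
	{
		assert(fastpath_hlt(1, 1) == FASTPATH_REENTER_GUEST);
		assert(fastpath_hlt(1, 0) == FASTPATH_EXIT_HANDLED);
		assert(fastpath_hlt(0, 0) == FASTPATH_EXIT_USERSPACE);
		return 0;
	}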
@@ -4992,8 +5004,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.hardware_unsetup = svm_hardware_unsetup,
- .hardware_enable = svm_hardware_enable,
- .hardware_disable = svm_hardware_disable,
+ .enable_virtualization_cpu = svm_enable_virtualization_cpu,
+ .disable_virtualization_cpu = svm_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = svm_emergency_disable_virtualization_cpu,
.has_emulated_msr = svm_has_emulated_msr,
.vcpu_create = svm_vcpu_create,
@@ -5011,7 +5024,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
- .get_msr_feature = svm_get_msr_feature,
+ .get_feature_msr = svm_get_feature_msr,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
.get_segment_base = svm_get_segment_base,
@@ -5062,6 +5075,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.enable_nmi_window = svm_enable_nmi_window,
.enable_irq_window = svm_enable_irq_window,
.update_cr8_intercept = svm_update_cr8_intercept,
+
+ .x2apic_icr_is_split = true,
.set_virtual_apic_mode = avic_refresh_virtual_apic_mode,
.refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl,
.apicv_post_state_restore = avic_apicv_post_state_restore,
@@ -5266,7 +5281,7 @@ static __init int svm_hardware_setup(void)
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ iopm_base = __sme_page_pa(iopm_pages);
init_msrpm_offsets();
@@ -5425,8 +5440,6 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static void __svm_exit(void)
{
kvm_x86_vendor_exit();
-
- cpu_emergency_unregister_virt_callback(svm_emergency_disable);
}
static int __init svm_init(void)
@@ -5442,8 +5455,6 @@ static int __init svm_init(void)
if (r)
return r;
- cpu_emergency_register_virt_callback(svm_emergency_disable);
-
/*
* Common KVM initialization _must_ come last, after this, /dev/kvm is
* exposed to userspace!
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 76107c7d0595..43fa6a16eb19 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -25,7 +25,21 @@
#include "cpuid.h"
#include "kvm_cache_regs.h"
-#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
+/*
+ * Helpers to convert to/from physical addresses for pages whose address is
+ * consumed directly by hardware. Even though it's a physical address, SVM
+ * often restricts the address to the natural width, hence 'unsigned long'
+ * instead of 'hpa_t'.
+ */
+static inline unsigned long __sme_page_pa(struct page *page)
+{
+ return __sme_set(page_to_pfn(page) << PAGE_SHIFT);
+}
+
+static inline struct page *__sme_pa_to_page(unsigned long pa)
+{
+ return pfn_to_page(__sme_clr(pa) >> PAGE_SHIFT);
+}
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
@@ -321,7 +335,7 @@ struct svm_cpu_data {
u32 next_asid;
u32 min_asid;
- struct page *save_area;
+ struct vmcb *save_area;
unsigned long save_area_pa;
struct vmcb *current_vmcb;
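
The open-coded shift-and-mask dance is now hidden behind typed helpers. A minimal user-space model of the round-trip invariant they provide; SME_ME_MASK's bit position is an assumption here, the real mask is CPU-specific:

	#include <assert.h>
	#include <stdint.h>

	#define PAGE_SHIFT	12
	#define SME_ME_MASK	(1ULL << 47)	/* assumed C-bit position */

	static uint64_t sme_page_pa(uint64_t pfn)
	{
		return (pfn << PAGE_SHIFT) | SME_ME_MASK;	/* __sme_set() */
	}

	static uint64_t sme_pa_to_pfn(uint64_t pa)
	{
		return (pa & ~SME_ME_MASK) >> PAGE_SHIFT;	/* __sme_clr() */
	}

	int main(void)
	{
		uint64_t pfn = 0x1234;

		/* Encoding the C-bit and stripping it again must be lossless. */
		assert(sme_pa_to_pfn(sme_page_pa(pfn)) == pfn);
		return 0;
	}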
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index a0c8eb37d3e1..2ed80aea3bb1 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -209,10 +209,8 @@ SYM_FUNC_START(__svm_vcpu_run)
7: vmload %_ASM_AX
8:
-#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
/* Clobbers RAX, RCX, RDX. */
RESTORE_HOST_SPEC_CTRL
@@ -348,10 +346,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
2: cli
-#ifdef CONFIG_MITIGATION_RETPOLINE
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-#endif
+ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
/* Clobbers RAX, RCX, RDX, consumes RDI (@svm) and RSI (@spec_ctrl_intercepted). */
RESTORE_HOST_SPEC_CTRL
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 41a4533f9989..cb6588238f46 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -54,9 +54,7 @@ struct nested_vmx_msrs {
};
struct vmcs_config {
- int size;
- u32 basic_cap;
- u32 revision_id;
+ u64 basic;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
@@ -76,7 +74,7 @@ extern struct vmx_capability vmx_capability __ro_after_init;
static inline bool cpu_has_vmx_basic_inout(void)
{
- return (((u64)vmcs_config.basic_cap << 32) & VMX_BASIC_INOUT);
+ return vmcs_config.basic & VMX_BASIC_INOUT;
}
static inline bool cpu_has_virtual_nmis(void)
@@ -225,7 +223,7 @@ static inline bool cpu_has_vmx_vmfunc(void)
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
/* check if the cpu supports writing r/o exit information fields */
- if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ if (!(vmcs_config.misc & VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -367,7 +365,7 @@ static inline bool cpu_has_vmx_invvpid_global(void)
static inline bool cpu_has_vmx_intel_pt(void)
{
- return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ return (vmcs_config.misc & VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
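
Collapsing size/basic_cap/revision_id into the raw 64-bit MSR value works because IA32_VMX_BASIC is self-describing. A compilable sketch of the field layout the vmx_basic_*() accessors decode (layout per the SDM; the macro names are illustrative, not KVM's):

	#include <stdint.h>
	#include <stdio.h>

	#define BASIC_REVISION(b)	((uint32_t)((b) & 0x7fffffffULL))
	#define BASIC_VMCS_SIZE(b)	((uint32_t)(((b) >> 32) & 0x1fff))
	#define BASIC_MEM_TYPE(b)	((uint32_t)(((b) >> 50) & 0xf))
	#define BASIC_INOUT(b)		(!!((b) & (1ULL << 54)))
	#define BASIC_TRUE_CTLS(b)	(!!((b) & (1ULL << 55)))

	int main(void)
	{
		uint64_t basic = 0x00d81000ULL |	/* revision id */
				 (0x1000ULL << 32) |	/* 4 KiB VMCS  */
				 (6ULL << 50) |		/* WB memtype  */
				 (1ULL << 54) | (1ULL << 55);

		printf("rev=%#x size=%u memtype=%u inout=%d true_ctls=%d\n",
		       BASIC_REVISION(basic), BASIC_VMCS_SIZE(basic),
		       BASIC_MEM_TYPE(basic), BASIC_INOUT(basic),
		       BASIC_TRUE_CTLS(basic));
		return 0;
	}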
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 0bf35ebe8a1b..7668e2fb8043 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -23,8 +23,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.hardware_unsetup = vmx_hardware_unsetup,
- .hardware_enable = vmx_hardware_enable,
- .hardware_disable = vmx_hardware_disable,
+ .enable_virtualization_cpu = vmx_enable_virtualization_cpu,
+ .disable_virtualization_cpu = vmx_disable_virtualization_cpu,
+ .emergency_disable_virtualization_cpu = vmx_emergency_disable_virtualization_cpu,
+
.has_emulated_msr = vmx_has_emulated_msr,
.vm_size = sizeof(struct kvm_vmx),
@@ -41,7 +43,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vcpu_put = vmx_vcpu_put,
.update_exception_bitmap = vmx_update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
+ .get_feature_msr = vmx_get_feature_msr,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
.get_segment_base = vmx_get_segment_base,
@@ -89,6 +91,8 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.enable_nmi_window = vmx_enable_nmi_window,
.enable_irq_window = vmx_enable_irq_window,
.update_cr8_intercept = vmx_update_cr8_intercept,
+
+ .x2apic_icr_is_split = false,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 2392a7ef254d..a8e7bc04d9bf 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -981,7 +981,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
goto fail;
}
- if (kvm_set_msr(vcpu, e.index, e.value)) {
+ if (kvm_set_msr_with_filter(vcpu, e.index, e.value)) {
pr_debug_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
__func__, i, e.index, e.value);
@@ -1017,7 +1017,7 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
}
}
- if (kvm_get_msr(vcpu, msr_index, data)) {
+ if (kvm_get_msr_with_filter(vcpu, msr_index, data)) {
pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
msr_index);
return false;
@@ -1112,9 +1112,9 @@ static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
/*
* Emulated VMEntry does not fail here. Instead a less
* accurate value will be returned by
- * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
- * instead of reading the value from the vmcs02 VMExit
- * MSR-store area.
+ * nested_vmx_get_vmexit_msr_value() by reading KVM's
+ * internal MSR state instead of reading the value from
+ * the vmcs02 VMExit MSR-store area.
*/
pr_warn_ratelimited(
"Not enough msr entries in msr_autostore. Can't add msr %x\n",
@@ -1251,21 +1251,32 @@ static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved =
- /* feature (except bit 48; see below) */
- BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
- /* reserved */
- BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT |
+ VMX_BASIC_INOUT |
+ VMX_BASIC_TRUE_CTLS;
+
+ const u64 reserved_bits = GENMASK_ULL(63, 56) |
+ GENMASK_ULL(47, 45) |
+ BIT_ULL(31);
+
u64 vmx_basic = vmcs_config.nested.basic;
- if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * Except for 32BIT_PHYS_ADDR_ONLY, which is an anti-feature bit (has
+ * inverted polarity), the incoming value must not set feature bits or
+ * reserved bits that aren't allowed/supported by KVM. Fields, i.e.
+ * multi-bit values, are explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_basic, data, feature_bits | reserved_bits))
return -EINVAL;
/*
* KVM does not emulate a version of VMX that constrains physical
* addresses of VMX structures (e.g. VMCS) to 32-bits.
*/
- if (data & BIT_ULL(48))
+ if (data & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EINVAL;
if (vmx_basic_vmcs_revision_id(vmx_basic) !=
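
The named feature/reserved masks feed the same is_bitwise_subset() check as before. A stand-alone model of its semantics, a restricted-subset test within a mask (the kernel's helper may be written differently):

	#include <assert.h>
	#include <stdint.h>

	/* Within @mask, @subset may only set bits that @superset also sets. */
	static int is_bitwise_subset(uint64_t superset, uint64_t subset,
				     uint64_t mask)
	{
		return ((subset & mask) & ~(superset & mask)) == 0;
	}

	int main(void)
	{
		/* e.g. userspace may clear VMX_BASIC_INOUT but never set it. */
		uint64_t host = 1ULL << 54, mask = 1ULL << 54;

		assert(is_bitwise_subset(host, 0, mask));
		assert(!is_bitwise_subset(0, host, mask));
		return 0;
	}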
@@ -1334,16 +1345,29 @@ vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
- const u64 feature_and_reserved_bits =
- /* feature */
- BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
- BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
- /* reserved */
- GENMASK_ULL(13, 9) | BIT_ULL(31);
+ const u64 feature_bits = VMX_MISC_SAVE_EFER_LMA |
+ VMX_MISC_ACTIVITY_HLT |
+ VMX_MISC_ACTIVITY_SHUTDOWN |
+ VMX_MISC_ACTIVITY_WAIT_SIPI |
+ VMX_MISC_INTEL_PT |
+ VMX_MISC_RDMSR_IN_SMM |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMXOFF_BLOCK_SMI |
+ VMX_MISC_ZERO_LEN_INS;
+
+ const u64 reserved_bits = BIT_ULL(31) | GENMASK_ULL(13, 9);
+
u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
vmcs_config.nested.misc_high);
- if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ BUILD_BUG_ON(feature_bits & reserved_bits);
+
+ /*
+ * The incoming value must not set feature bits or reserved bits that
+ * aren't allowed/supported by KVM. Fields, i.e. multi-bit values, are
+ * explicitly checked below.
+ */
+ if (!is_bitwise_subset(vmx_misc, data, feature_bits | reserved_bits))
return -EINVAL;
if ((vmx->nested.msrs.pinbased_ctls_high &
@@ -2317,10 +2341,12 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
/* Posted interrupts setting is only taken from vmcs12. */
vmx->nested.pi_pending = false;
- if (nested_cpu_has_posted_intr(vmcs12))
+ if (nested_cpu_has_posted_intr(vmcs12)) {
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- else
+ } else {
+ vmx->nested.posted_intr_nv = -1;
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
pin_controls_set(vmx, exec_control);
/*
@@ -2470,6 +2496,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
+
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
@@ -2507,7 +2534,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- vmx->segment_cache.bitmask = 0;
+ vmx_segment_cache_clear(vmx);
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -4284,11 +4311,52 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
}
if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
+ int irq;
+
if (block_nested_events)
return -EBUSY;
if (!nested_exit_on_intr(vcpu))
goto no_vmexit;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+
+ if (!nested_exit_intr_ack_set(vcpu)) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
+ return 0;
+ }
+
+ irq = kvm_cpu_get_extint(vcpu);
+ if (irq != -1) {
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+ return 0;
+ }
+
+ irq = kvm_apic_has_interrupt(vcpu);
+ if (WARN_ON_ONCE(irq < 0))
+ goto no_vmexit;
+
+ /*
+ * If the IRQ is L2's PI notification vector, process posted
+ * interrupts for L2 instead of injecting VM-Exit, as the
+ * detection/morphing architecturally occurs when the IRQ is
+ * delivered to the CPU. Note, only interrupts that are routed
+ * through the local APIC trigger posted interrupt processing,
+ * and enabling posted interrupts requires ACK-on-exit.
+ */
+ if (irq == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_apic_clear_irr(vcpu, irq);
+ goto no_vmexit;
+ }
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT,
+ INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR | irq, 0);
+
+ /*
+ * ACK the interrupt _after_ emulating VM-Exit, as the IRQ must
+ * be marked as in-service in vmcs01.GUEST_INTERRUPT_STATUS.SVI
+ * if APICv is active.
+ */
+ kvm_apic_ack_interrupt(vcpu, irq);
return 0;
}
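
For the ACK-on-exit path, the VM-Exit interruption-information word is now assembled inline rather than patched in after the fact by nested_vmx_vmexit(). A small model of the encoding used above (layout per the SDM):

	#include <assert.h>

	/*
	 * Vector in bits 7:0, type in bits 10:8 (0 = external interrupt),
	 * valid in bit 31.
	 */
	#define INTR_TYPE_EXT_INTR	(0u << 8)
	#define INTR_INFO_VALID_MASK	(1u << 31)

	static unsigned int ext_intr_exit_info(unsigned int vector)
	{
		return INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR |
		       (vector & 0xff);
	}

	int main(void)
	{
		/* vector 0x20 yields 0x80000020 in vmcs12->vm_exit_intr_info */
		assert(ext_intr_exit_info(0x20) == 0x80000020u);
		return 0;
	}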
@@ -4806,7 +4874,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
goto vmabort;
}
- if (kvm_set_msr(vcpu, h.index, h.value)) {
+ if (kvm_set_msr_with_filter(vcpu, h.index, h.value)) {
pr_debug_ratelimited(
"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
__func__, j, h.index, h.value);
@@ -4969,14 +5037,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- nested_exit_intr_ack_set(vcpu)) {
- int irq = kvm_cpu_get_interrupt(vcpu);
- WARN_ON(irq < 0);
- vmcs12->vm_exit_intr_info = irq |
- INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
- }
-
if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
@@ -7051,7 +7111,7 @@ static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
{
msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT |
VMX_MISC_ACTIVITY_WAIT_SIPI;
@@ -7066,12 +7126,10 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
* guest, and the VMCS structure we give it - not about the
* VMX support of the underlying hardware.
*/
- msrs->basic =
- VMCS12_REVISION |
- VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+ msrs->basic = vmx_basic_encode_vmcs_info(VMCS12_REVISION, VMCS12_SIZE,
+ X86_MEMTYPE_WB);
+ msrs->basic |= VMX_BASIC_TRUE_CTLS;
if (cpu_has_vmx_basic_inout())
msrs->basic |= VMX_BASIC_INOUT;
}
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index cce4e2aa30fb..2c296b6abb8c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -39,11 +39,17 @@ bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_vmcs12;
}
static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
{
+ lockdep_assert_once(lockdep_is_held(&vcpu->mutex) ||
+ !refcount_read(&vcpu->kvm->users_count));
+
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
@@ -109,7 +115,7 @@ static inline unsigned nested_cpu_vmx_misc_cr3_count(struct kvm_vcpu *vcpu)
static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu)
{
return to_vmx(vcpu)->nested.msrs.misc_low &
- MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
+ VMX_MISC_VMWRITE_SHADOW_RO_FIELDS;
}
static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 6fef01e0536e..a3c3d2a51f47 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -274,7 +274,7 @@ static int handle_encls_ecreate(struct kvm_vcpu *vcpu)
* simultaneously set SGX_ATTR_PROVISIONKEY to bypass the check to
* enforce restriction of access to the PROVISIONKEY.
*/
- contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL_ACCOUNT);
+ contents = (struct sgx_secs *)__get_free_page(GFP_KERNEL);
if (!contents)
return -ENOMEM;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 733a0c45d1a6..1a4438358c5e 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -525,10 +525,6 @@ static const struct kvm_vmx_segment_field {
VMX_SEGMENT_FIELD(LDTR),
};
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
-{
- vmx->segment_cache.bitmask = 0;
-}
static unsigned long host_idt_base;
@@ -755,7 +751,7 @@ fault:
return -EIO;
}
-static void vmx_emergency_disable(void)
+void vmx_emergency_disable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
@@ -1998,15 +1994,15 @@ static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
return !(msr->data & ~valid_bits);
}
-int vmx_get_msr_feature(struct kvm_msr_entry *msr)
+int vmx_get_feature_msr(u32 msr, u64 *data)
{
- switch (msr->index) {
+ switch (msr) {
case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
if (!nested)
return 1;
- return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ return vmx_get_vmx_msr(&vmcs_config.nested, msr, data);
default:
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
}
@@ -2605,13 +2601,13 @@ static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
- u32 vmx_msr_low, vmx_msr_high;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 basic_msr;
u64 misc_msr;
int i;
@@ -2734,29 +2730,29 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
_vmexit_control &= ~x_ctrl;
}
- rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
+ rdmsrl(MSR_IA32_VMX_BASIC, basic_msr);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
- if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
+ if (vmx_basic_vmcs_size(basic_msr) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
- /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
- if (vmx_msr_high & (1u<<16))
+ /*
+ * KVM expects to be able to shove all legal physical addresses into
+ * VMCS fields for 64-bit kernels, and per the SDM, "This bit is always
+ * 0 for processors that support Intel 64 architecture".
+ */
+ if (basic_msr & VMX_BASIC_32BIT_PHYS_ADDR_ONLY)
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
- if (((vmx_msr_high >> 18) & 15) != 6)
+ if (vmx_basic_vmcs_mem_type(basic_msr) != X86_MEMTYPE_WB)
return -EIO;
rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
- vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
-
- vmcs_conf->revision_id = vmx_msr_low;
-
+ vmcs_conf->basic = basic_msr;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
@@ -2844,7 +2840,7 @@ fault:
return -EFAULT;
}
-int vmx_hardware_enable(void)
+int vmx_enable_virtualization_cpu(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -2881,7 +2877,7 @@ static void vmclear_local_loaded_vmcss(void)
__loaded_vmcs_clear(v);
}
-void vmx_hardware_disable(void)
+void vmx_disable_virtualization_cpu(void)
{
vmclear_local_loaded_vmcss();
@@ -2903,13 +2899,13 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
if (!pages)
return NULL;
vmcs = page_address(pages);
- memset(vmcs, 0, vmcs_config.size);
+ memset(vmcs, 0, vmx_basic_vmcs_size(vmcs_config.basic));
/* KVM supports Enlightened VMCS v1 only */
if (kvm_is_using_evmcs())
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
@@ -3002,7 +2998,7 @@ static __init int alloc_kvm_area(void)
* physical CPU.
*/
if (kvm_is_using_evmcs())
- vmcs->hdr.revision_id = vmcs_config.revision_id;
+ vmcs->hdr.revision_id = vmx_basic_vmcs_revision_id(vmcs_config.basic);
per_cpu(vmxarea, cpu) = vmcs;
}
@@ -4219,6 +4215,13 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * DO NOT query the vCPU's vmcs12, as vmcs12 is dynamically allocated
+ * and freed, and must not be accessed outside of vcpu->mutex. The
+ * vCPU's cached PI NV is valid if and only if posted interrupts
+ * enabled in its vmcs12, i.e. checking the vector also checks that
+ * L1 has enabled posted interrupts for L2.
+ */
if (is_guest_mode(vcpu) &&
vector == vmx->nested.posted_intr_nv) {
/*
@@ -5804,8 +5807,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
- error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
- PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
+ error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
+ PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
/*
* Check that the GPA doesn't exceed physical memory limits, as that is
@@ -7265,6 +7269,8 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
return handle_fastpath_set_msr_irqoff(vcpu);
case EXIT_REASON_PREEMPTION_TIMER:
return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
+ case EXIT_REASON_HLT:
+ return handle_fastpath_hlt(vcpu);
default:
return EXIT_FASTPATH_NONE;
}
@@ -7965,6 +7971,7 @@ static __init void vmx_set_cpu_caps(void)
kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
kvm_cpu_cap_clear(X86_FEATURE_SGX1);
kvm_cpu_cap_clear(X86_FEATURE_SGX2);
+ kvm_cpu_cap_clear(X86_FEATURE_SGX_EDECCSSA);
}
if (vmx_umip_emulated())
@@ -8515,7 +8522,7 @@ __init int vmx_hardware_setup(void)
u64 use_timer_freq = 5000ULL * 1000 * 1000;
cpu_preemption_timer_multi =
- vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmx_misc_preemption_timer_rate(vmcs_config.misc);
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -8582,8 +8589,6 @@ static void __vmx_exit(void)
{
allow_smaller_maxphyaddr = false;
- cpu_emergency_unregister_virt_callback(vmx_emergency_disable);
-
vmx_cleanup_l1d_flush();
}
@@ -8630,8 +8635,6 @@ static int __init vmx_init(void)
pi_init_cpu(cpu);
}
- cpu_emergency_register_virt_callback(vmx_emergency_disable);
-
vmx_check_vmcs12_offsets();
/*
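
The EPT-violation fix above only derives PFERR_GUEST_{FINAL,PAGE} when the exit qualification reports a valid guest linear address. A compilable reduction of that mapping; the bit positions mirror the SDM and KVM's PFERR definitions, reproduced here for illustration:

	#include <assert.h>
	#include <stdint.h>

	#define EPT_GVA_IS_VALID	(1ULL << 7)	/* exit qual bit 7 */
	#define EPT_GVA_TRANSLATED	(1ULL << 8)	/* valid only w/ bit 7 */
	#define PFERR_GUEST_FINAL	(1ULL << 32)
	#define PFERR_GUEST_PAGE	(1ULL << 33)

	static uint64_t pferr_for(uint64_t exit_qual)
	{
		uint64_t error_code = 0;

		if (exit_qual & EPT_GVA_IS_VALID)
			error_code |= (exit_qual & EPT_GVA_TRANSLATED) ?
				      PFERR_GUEST_FINAL : PFERR_GUEST_PAGE;
		return error_code;
	}

	int main(void)
	{
		/* No valid GVA: neither guest-page nor guest-final is set. */
		assert(pferr_for(0) == 0);
		assert(pferr_for(EPT_GVA_IS_VALID) == PFERR_GUEST_PAGE);
		assert(pferr_for(EPT_GVA_IS_VALID | EPT_GVA_TRANSLATED) ==
		       PFERR_GUEST_FINAL);
		return 0;
	}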
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 42498fa63abb..2325f773a20b 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -17,10 +17,6 @@
#include "run_flags.h"
#include "../mmu.h"
-#define MSR_TYPE_R 1
-#define MSR_TYPE_W 2
-#define MSR_TYPE_RW 3
-
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
#ifdef CONFIG_X86_64
@@ -756,4 +752,9 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_onhyperv.h b/arch/x86/kvm/vmx/vmx_onhyperv.h
index eb48153bfd73..bba24ed99ee6 100644
--- a/arch/x86/kvm/vmx/vmx_onhyperv.h
+++ b/arch/x86/kvm/vmx/vmx_onhyperv.h
@@ -104,6 +104,14 @@ static inline void evmcs_load(u64 phys_addr)
struct hv_vp_assist_page *vp_ap =
hv_get_vp_assist_page(smp_processor_id());
+ /*
+ * When enabling eVMCS, KVM verifies that every CPU has a valid
+ * hv_vp_assist_page() and aborts enabling the feature otherwise.
+ * The CPU onlining path is likewise checked in
+ * vmx_enable_virtualization_cpu().
+ */
+ if (KVM_BUG_ON(!vp_ap, kvm_get_running_vcpu()->kvm))
+ return;
+
if (current_evmcs->hv_enlightenments_control.nested_flush_hypercall)
vp_ap->nested_control.features.directhypercall = 1;
vp_ap->current_nested_vmcs = phys_addr;
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 8060e5fc6dbd..93e020dc88f6 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -47,7 +47,7 @@ static __always_inline void vmcs_check16(unsigned long field)
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6001) == 0x2001,
"16-bit accessor invalid for 64-bit high field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x4000,
- "16-bit accessor invalid for 32-bit high field");
+ "16-bit accessor invalid for 32-bit field");
BUILD_BUG_ON_MSG(__builtin_constant_p(field) && ((field) & 0x6000) == 0x6000,
"16-bit accessor invalid for natural width field");
}
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index ce3221cd1d01..a55981c5216e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -13,8 +13,9 @@ extern struct kvm_x86_init_ops vt_init_ops __initdata;
void vmx_hardware_unsetup(void);
int vmx_check_processor_compat(void);
-int vmx_hardware_enable(void);
-void vmx_hardware_disable(void);
+int vmx_enable_virtualization_cpu(void);
+void vmx_disable_virtualization_cpu(void);
+void vmx_emergency_disable_virtualization_cpu(void);
int vmx_vm_init(struct kvm *kvm);
void vmx_vm_destroy(struct kvm *kvm);
int vmx_vcpu_precreate(struct kvm *kvm);
@@ -56,7 +57,7 @@ bool vmx_has_emulated_msr(struct kvm *kvm, u32 index);
void vmx_msr_filter_changed(struct kvm_vcpu *vcpu);
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu);
-int vmx_get_msr_feature(struct kvm_msr_entry *msr);
+int vmx_get_feature_msr(u32 msr, u64 *data);
int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c983c8e434b8..83fe0a78146f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -305,24 +305,237 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
static struct kmem_cache *x86_emulator_cache;
/*
- * When called, it means the previous get/set msr reached an invalid msr.
- * Return true if we want to ignore/silent this failed msr access.
+ * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
+ * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
+ * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
+ * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
+ * MSRs that KVM emulates without strictly requiring host support.
+ * msr_based_features holds MSRs that enumerate features, i.e. are effectively
+ * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
+ * msrs_to_save and emulated_msrs.
*/
-static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
+
+static const u32 msrs_to_save_base[] = {
+ MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+ MSR_STAR,
+#ifdef CONFIG_X86_64
+ MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+ MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_IA32_XFD, MSR_IA32_XFD_ERR,
+};
+
+static const u32 msrs_to_save_pmu[] = {
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL,
+ MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
+
+ /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+
+ /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
+
+ MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
+ MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
+};
+
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
+ ARRAY_SIZE(msrs_to_save_pmu)];
+static unsigned num_msrs_to_save;
+
+static const u32 emulated_msrs_all[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+
+#ifdef CONFIG_KVM_HYPERV
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
+ HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+ HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+ HV_X64_MSR_RESET,
+ HV_X64_MSR_VP_INDEX,
+ HV_X64_MSR_VP_RUNTIME,
+ HV_X64_MSR_SCONTROL,
+ HV_X64_MSR_STIMER0_CONFIG,
+ HV_X64_MSR_VP_ASSIST_PAGE,
+ HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
+ HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
+#endif
+
+ MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
+
+ MSR_IA32_TSC_ADJUST,
+ MSR_IA32_TSC_DEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+ MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MCG_STATUS,
+ MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
+ MSR_IA32_SMBASE,
+ MSR_SMI_COUNT,
+ MSR_PLATFORM_INFO,
+ MSR_MISC_FEATURES_ENABLES,
+ MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_AMD64_TSC_RATIO,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * KVM always supports the "true" VMX control MSRs, even if the host
+ * does not. The VMX MSRs as a whole are considered "emulated" as KVM
+ * doesn't strictly require them to exist in the host (ignoring that
+ * KVM would refuse to load in the first place if the core set of MSRs
+ * aren't supported).
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
+};
+
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
+static unsigned num_emulated_msrs;
+
+/*
+ * List of MSRs that control the existence of MSR-based features, i.e. MSRs
+ * that are effectively CPUID leafs. VMX MSRs are also included in the set of
+ * feature MSRs, but are handled separately to allow expedited lookups.
+ */
+static const u32 msr_based_features_all_except_vmx[] = {
+ MSR_AMD64_DE_CFG,
+ MSR_IA32_UCODE_REV,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
+};
+
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
+ (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
+static unsigned int num_msr_based_features;
+
+/*
+ * All feature MSRs except uCode revID, which tracks the currently loaded uCode
+ * patch, are immutable once the vCPU model is defined.
+ */
+static bool kvm_is_immutable_feature_msr(u32 msr)
{
- const char *op = write ? "wrmsr" : "rdmsr";
+ int i;
- if (ignore_msrs) {
- if (report_ignored_msrs)
- kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
- op, msr, data);
- /* Mask the error */
+ if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
return true;
- } else {
+
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
+ if (msr == msr_based_features_all_except_vmx[i])
+ return msr != MSR_IA32_UCODE_REV;
+ }
+
+ return false;
+}
+
+static bool kvm_is_advertised_msr(u32 msr_index)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_msrs_to_save; i++) {
+ if (msrs_to_save[i] == msr_index)
+ return true;
+ }
+
+ for (i = 0; i < num_emulated_msrs; i++) {
+ if (emulated_msrs[i] == msr_index)
+ return true;
+ }
+
+ return false;
+}
+
+typedef int (*msr_access_t)(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated);
+
+static __always_inline int kvm_do_msr_access(struct kvm_vcpu *vcpu, u32 msr,
+ u64 *data, bool host_initiated,
+ enum kvm_msr_access rw,
+ msr_access_t msr_access_fn)
+{
+ const char *op = rw == MSR_TYPE_W ? "wrmsr" : "rdmsr";
+ int ret;
+
+ BUILD_BUG_ON(rw != MSR_TYPE_R && rw != MSR_TYPE_W);
+
+ /*
+ * Zero the data on read failures to avoid leaking stack data to the
+ * guest and/or userspace, e.g. if the failure is ignored below.
+ */
+ ret = msr_access_fn(vcpu, msr, data, host_initiated);
+ if (ret && rw == MSR_TYPE_R)
+ *data = 0;
+
+ if (ret != KVM_MSR_RET_UNSUPPORTED)
+ return ret;
+
+ /*
+ * Userspace is allowed to read MSRs, and write '0' to MSRs, that KVM
+ * advertises to userspace, even if an MSR isn't fully supported.
+ * Simply check that @data is '0', which covers both the write '0' case
+ * and all reads (in which case @data is zeroed on failure; see above).
+ */
+ if (host_initiated && !*data && kvm_is_advertised_msr(msr))
+ return 0;
+
+ if (!ignore_msrs) {
kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
- op, msr, data);
- return false;
+ op, msr, *data);
+ return ret;
}
+
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", op, msr, *data);
+
+ return 0;
}
static struct kmem_cache *kvm_alloc_emulator_cache(void)
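
kvm_do_msr_access() centralizes the policy that kvm_msr_ignored_check() and the open-coded checks in kvm_{get,set}_msr_common() previously duplicated. A user-space reduction of that policy, with an `advertised` flag standing in for kvm_is_advertised_msr() and a stub accessor in place of the real MSR handlers:

	#include <stdint.h>
	#include <stdio.h>

	enum { MSR_TYPE_R = 1, MSR_TYPE_W = 2 };
	#define RET_UNSUPPORTED 2  /* stand-in for KVM_MSR_RET_UNSUPPORTED */

	typedef int (*msr_access_t)(uint32_t index, uint64_t *data);

	static int do_msr_access(uint32_t msr, uint64_t *data, int rw,
				 int host_initiated, int advertised,
				 int ignore_msrs, msr_access_t fn)
	{
		int ret = fn(msr, data);

		if (ret && rw == MSR_TYPE_R)
			*data = 0;	/* never leak data on failed reads */
		if (ret != RET_UNSUPPORTED)
			return ret;
		/* host may read, or write '0' to, any advertised MSR */
		if (host_initiated && !*data && advertised)
			return 0;
		return ignore_msrs ? 0 : ret;
	}

	static int always_unsupported(uint32_t i, uint64_t *d)
	{
		(void)i; (void)d;
		return RET_UNSUPPORTED;
	}

	int main(void)
	{
		uint64_t v = 0xdead;

		/* advertised MSR, host read: failure masked, v zeroed */
		printf("%d %llx\n",
		       do_msr_access(0x10, &v, MSR_TYPE_R, 1, 1, 0,
				     always_unsupported),
		       (unsigned long long)v);
		return 0;
	}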
@@ -355,7 +568,7 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
/*
* Disabling irqs at this point since the following code could be
- * interrupted and executed through kvm_arch_hardware_disable()
+ * interrupted and executed through kvm_arch_disable_virtualization_cpu()
*/
local_irq_save(flags);
if (msrs->registered) {
@@ -413,8 +626,7 @@ EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
static void kvm_user_return_msr_cpu_online(void)
{
- unsigned int cpu = smp_processor_id();
- struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = this_cpu_ptr(user_return_msrs);
u64 value;
int i;
@@ -621,12 +833,6 @@ static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vecto
ex->payload = payload;
}
-/* Forcibly leave the nested mode in cases like a vCPU reset */
-static void kvm_leave_nested(struct kvm_vcpu *vcpu)
-{
- kvm_x86_ops.nested_ops->leave_nested(vcpu);
-}
-
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -1412,178 +1618,6 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
/*
- * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track
- * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
- * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that
- * require host support, i.e. should be probed via RDMSR. emulated_msrs holds
- * MSRs that KVM emulates without strictly requiring host support.
- * msr_based_features holds MSRs that enumerate features, i.e. are effectively
- * CPUID leafs. Note, msr_based_features isn't mutually exclusive with
- * msrs_to_save and emulated_msrs.
- */
-
-static const u32 msrs_to_save_base[] = {
- MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
- MSR_STAR,
-#ifdef CONFIG_X86_64
- MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
-#endif
- MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
- MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
- MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
- MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
- MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
- MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
- MSR_IA32_UMWAIT_CONTROL,
-
- MSR_IA32_XFD, MSR_IA32_XFD_ERR,
-};
-
-static const u32 msrs_to_save_pmu[] = {
- MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
- MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
- MSR_CORE_PERF_GLOBAL_CTRL,
- MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
-
- /* This part of MSRs should match KVM_MAX_NR_INTEL_GP_COUNTERS. */
- MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
- MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
- MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
- MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
- MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
- MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
- MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
- MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
-
- MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
- MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
-
- /* This part of MSRs should match KVM_MAX_NR_AMD_GP_COUNTERS. */
- MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
- MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
- MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
- MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
-
- MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
- MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
-};
-
-static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
- ARRAY_SIZE(msrs_to_save_pmu)];
-static unsigned num_msrs_to_save;
-
-static const u32 emulated_msrs_all[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-
-#ifdef CONFIG_KVM_HYPERV
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
- HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
- HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
- HV_X64_MSR_RESET,
- HV_X64_MSR_VP_INDEX,
- HV_X64_MSR_VP_RUNTIME,
- HV_X64_MSR_SCONTROL,
- HV_X64_MSR_STIMER0_CONFIG,
- HV_X64_MSR_VP_ASSIST_PAGE,
- HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
- HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
- HV_X64_MSR_SYNDBG_OPTIONS,
- HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
- HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
- HV_X64_MSR_SYNDBG_PENDING_BUFFER,
-#endif
-
- MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
-
- MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSC_DEADLINE,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
- MSR_IA32_MISC_ENABLE,
- MSR_IA32_MCG_STATUS,
- MSR_IA32_MCG_CTL,
- MSR_IA32_MCG_EXT_CTL,
- MSR_IA32_SMBASE,
- MSR_SMI_COUNT,
- MSR_PLATFORM_INFO,
- MSR_MISC_FEATURES_ENABLES,
- MSR_AMD64_VIRT_SPEC_CTRL,
- MSR_AMD64_TSC_RATIO,
- MSR_IA32_POWER_CTL,
- MSR_IA32_UCODE_REV,
-
- /*
- * KVM always supports the "true" VMX control MSRs, even if the host
- * does not. The VMX MSRs as a whole are considered "emulated" as KVM
- * doesn't strictly require them to exist in the host (ignoring that
- * KVM would refuse to load in the first place if the core set of MSRs
- * aren't supported).
- */
- MSR_IA32_VMX_BASIC,
- MSR_IA32_VMX_TRUE_PINBASED_CTLS,
- MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
- MSR_IA32_VMX_TRUE_EXIT_CTLS,
- MSR_IA32_VMX_TRUE_ENTRY_CTLS,
- MSR_IA32_VMX_MISC,
- MSR_IA32_VMX_CR0_FIXED0,
- MSR_IA32_VMX_CR4_FIXED0,
- MSR_IA32_VMX_VMCS_ENUM,
- MSR_IA32_VMX_PROCBASED_CTLS2,
- MSR_IA32_VMX_EPT_VPID_CAP,
- MSR_IA32_VMX_VMFUNC,
-
- MSR_K7_HWCR,
- MSR_KVM_POLL_CONTROL,
-};
-
-static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
-static unsigned num_emulated_msrs;
-
-/*
- * List of MSRs that control the existence of MSR-based features, i.e. MSRs
- * that are effectively CPUID leafs. VMX MSRs are also included in the set of
- * feature MSRs, but are handled separately to allow expedited lookups.
- */
-static const u32 msr_based_features_all_except_vmx[] = {
- MSR_AMD64_DE_CFG,
- MSR_IA32_UCODE_REV,
- MSR_IA32_ARCH_CAPABILITIES,
- MSR_IA32_PERF_CAPABILITIES,
-};
-
-static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
- (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
-static unsigned int num_msr_based_features;
-
-/*
- * All feature MSRs except uCode revID, which tracks the currently loaded uCode
- * patch, are immutable once the vCPU model is defined.
- */
-static bool kvm_is_immutable_feature_msr(u32 msr)
-{
- int i;
-
- if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
- return true;
-
- for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
- if (msr == msr_based_features_all_except_vmx[i])
- return msr != MSR_IA32_UCODE_REV;
- }
-
- return false;
-}
-
-/*
* Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
* does not yet virtualize. These include:
* 10 - MISC_PACKAGE_CTRLS
@@ -1660,40 +1694,31 @@ static u64 kvm_get_arch_capabilities(void)
return data;
}
-static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
+static int kvm_get_feature_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
- switch (msr->index) {
+ WARN_ON_ONCE(!host_initiated);
+
+ switch (index) {
case MSR_IA32_ARCH_CAPABILITIES:
- msr->data = kvm_get_arch_capabilities();
+ *data = kvm_get_arch_capabilities();
break;
case MSR_IA32_PERF_CAPABILITIES:
- msr->data = kvm_caps.supported_perf_cap;
+ *data = kvm_caps.supported_perf_cap;
break;
case MSR_IA32_UCODE_REV:
- rdmsrl_safe(msr->index, &msr->data);
+ rdmsrl_safe(index, data);
break;
default:
- return kvm_x86_call(get_msr_feature)(msr);
+ return kvm_x86_call(get_feature_msr)(index, data);
}
return 0;
}
-static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+static int do_get_feature_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- struct kvm_msr_entry msr;
- int r;
-
- /* Unconditionally clear the output for simplicity */
- msr.data = 0;
- msr.index = index;
- r = kvm_get_msr_feature(&msr);
-
- if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
- r = 0;
-
- *data = msr.data;
-
- return r;
+ return kvm_do_msr_access(vcpu, index, data, true, MSR_TYPE_R,
+ kvm_get_feature_msr);
}
static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1880,16 +1905,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
return kvm_x86_call(set_msr)(vcpu, &msr);
}
+static int _kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ return __kvm_set_msr(vcpu, index, *data, host_initiated);
+}
+
static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 data, bool host_initiated)
{
- int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID)
- if (kvm_msr_ignored_check(index, data, true))
- ret = 0;
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, &data, host_initiated, MSR_TYPE_W,
+ _kvm_set_msr);
}
/*
@@ -1928,31 +1954,25 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
u32 index, u64 *data, bool host_initiated)
{
- int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
-
- if (ret == KVM_MSR_RET_INVALID) {
- /* Unconditionally clear *data for simplicity */
- *data = 0;
- if (kvm_msr_ignored_check(index, 0, false))
- ret = 0;
- }
-
- return ret;
+ return kvm_do_msr_access(vcpu, index, data, host_initiated, MSR_TYPE_R,
+ __kvm_get_msr);
}
-static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
return KVM_MSR_RET_FILTERED;
return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
+EXPORT_SYMBOL_GPL(kvm_get_msr_with_filter);
-static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
return KVM_MSR_RET_FILTERED;
return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
+EXPORT_SYMBOL_GPL(kvm_set_msr_with_filter);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
@@ -1999,7 +2019,7 @@ static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
static u64 kvm_msr_reason(int r)
{
switch (r) {
- case KVM_MSR_RET_INVALID:
+ case KVM_MSR_RET_UNSUPPORTED:
return KVM_MSR_EXIT_REASON_UNKNOWN;
case KVM_MSR_RET_FILTERED:
return KVM_MSR_EXIT_REASON_FILTER;
@@ -2162,31 +2182,34 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
{
u32 msr = kvm_rcx_read(vcpu);
u64 data;
- fastpath_t ret = EXIT_FASTPATH_NONE;
+ fastpath_t ret;
+ bool handled;
kvm_vcpu_srcu_read_lock(vcpu);
switch (msr) {
case APIC_BASE_MSR + (APIC_ICR >> 4):
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_EXIT_HANDLED;
- }
+ handled = !handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
break;
case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
- if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
- kvm_skip_emulated_instruction(vcpu);
- ret = EXIT_FASTPATH_REENTER_GUEST;
- }
+ handled = !handle_fastpath_set_tscdeadline(vcpu, data);
break;
default:
+ handled = false;
break;
}
- if (ret != EXIT_FASTPATH_NONE)
+ if (handled) {
+ if (!kvm_skip_emulated_instruction(vcpu))
+ ret = EXIT_FASTPATH_EXIT_USERSPACE;
+ else
+ ret = EXIT_FASTPATH_REENTER_GUEST;
trace_kvm_msr_write(msr, data);
+ } else {
+ ret = EXIT_FASTPATH_NONE;
+ }
kvm_vcpu_srcu_read_unlock(vcpu);
@@ -3746,18 +3769,6 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
}
-static bool kvm_is_msr_to_save(u32 msr_index)
-{
- unsigned int i;
-
- for (i = 0; i < num_msrs_to_save; i++) {
- if (msrs_to_save[i] == msr_index)
- return true;
- }
-
- return false;
-}
-
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u32 msr = msr_info->index;
@@ -4139,15 +4150,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to write '0' to MSRs that KVM reports
- * as to-be-saved, even if an MSRs isn't fully supported.
- */
- if (msr_info->host_initiated && !data &&
- kvm_is_msr_to_save(msr))
- break;
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4498,17 +4501,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
- /*
- * Userspace is allowed to read MSRs that KVM reports as
- * to-be-saved, even if an MSR isn't fully supported.
- */
- if (msr_info->host_initiated &&
- kvm_is_msr_to_save(msr_info->index)) {
- msr_info->data = 0;
- break;
- }
-
- return KVM_MSR_RET_INVALID;
+ return KVM_MSR_RET_UNSUPPORTED;
}
return 0;
}
@@ -4946,7 +4939,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(NULL, argp, do_get_msr_feature, 1);
+ r = msr_io(NULL, argp, do_get_feature_msr, 1);
break;
#ifdef CONFIG_KVM_HYPERV
case KVM_GET_SUPPORTED_HV_CPUID:
@@ -7383,11 +7376,9 @@ out:
static void kvm_probe_feature_msr(u32 msr_index)
{
- struct kvm_msr_entry msr = {
- .index = msr_index,
- };
+ u64 data;
- if (kvm_get_msr_feature(&msr))
+ if (kvm_get_feature_msr(NULL, msr_index, &data, true))
return;
msr_based_features[num_msr_based_features++] = msr_index;
@@ -8865,60 +8856,13 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
- int emulation_type)
+static bool kvm_unprotect_and_retry_on_failure(struct kvm_vcpu *vcpu,
+ gpa_t cr2_or_gpa,
+ int emulation_type)
{
- gpa_t gpa = cr2_or_gpa;
- kvm_pfn_t pfn;
-
if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (!vcpu->arch.mmu->root_role.direct) {
- /*
- * Write permission should be allowed since only
- * write access need to be emulated.
- */
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- /*
- * If the mapping is invalid in guest, let cpu retry
- * it to generate fault.
- */
- if (gpa == INVALID_GPA)
- return true;
- }
-
- /*
- * Do not retry the unhandleable instruction if it faults on the
- * readonly host memory, otherwise it will goto a infinite loop:
- * retry instruction -> write #PF -> emulation fail -> retry
- * instruction -> ...
- */
- pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-
- /*
- * If the instruction failed on the error pfn, it can not be fixed,
- * report the error to userspace.
- */
- if (is_error_noslot_pfn(pfn))
- return false;
-
- kvm_release_pfn_clean(pfn);
-
- /*
- * If emulation may have been triggered by a write to a shadowed page
- * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
- * guest to let the CPU re-execute the instruction in the hope that the
- * CPU can cleanly execute the instruction that KVM failed to emulate.
- */
- if (vcpu->kvm->arch.indirect_shadow_pages)
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
-
/*
* If the failed instruction faulted on an access to page tables that
* are used to translate any part of the instruction, KVM can't resolve
@@ -8929,54 +8873,24 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* then zap the SPTE to unprotect the gfn, and then do it all over
* again. Report the error to userspace.
*/
- return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP);
-}
-
-static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- gpa_t cr2_or_gpa, int emulation_type)
-{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
-
- last_retry_eip = vcpu->arch.last_retry_eip;
- last_retry_addr = vcpu->arch.last_retry_addr;
+ if (emulation_type & EMULTYPE_WRITE_PF_TO_SP)
+ return false;
/*
- * If the emulation is caused by #PF and it is non-page_table
- * writing instruction, it means the VM-EXIT is caused by shadow
- * page protected, we can zap the shadow page and retry this
- * instruction directly.
- *
- * Note: if the guest uses a non-page-table modifying instruction
- * on the PDE that points to the instruction, then we will unmap
- * the instruction and go to an infinite loop. So, we cache the
- * last retried eip and the last fault address, if we meet the eip
- * and the address again, we can break out of the potential infinite
- * loop.
+ * If emulation may have been triggered by a write to a shadowed page
+ * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the
+ * guest to let the CPU re-execute the instruction in the hope that the
+ * CPU can cleanly execute the instruction that KVM failed to emulate.
*/
- vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
-
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
- return false;
-
- if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
- WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
- return false;
-
- if (x86_page_table_writing_insn(ctxt))
- return false;
-
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
- return false;
-
- vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2_or_gpa;
-
- if (!vcpu->arch.mmu->root_role.direct)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
-
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+ __kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa, true);
+ /*
+ * Retry even if _this_ vCPU didn't unprotect the gfn, as it's possible
+ * all SPTEs were already zapped by a different task. The alternative
+ * is to report the error to userspace and likely terminate the guest,
+ * and the last_retry_{eip,addr} checks will prevent retrying the page
+ * fault indefinitely, i.e. there's nothing to lose by retrying.
+ */
return true;
}
@@ -9176,6 +9090,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))))
+ emulation_type &= ~EMULTYPE_ALLOW_RETRY_PF;
+
r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
if (r != X86EMUL_CONTINUE) {
if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT)
@@ -9206,8 +9125,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (reexecute_instruction(vcpu, cr2_or_gpa,
- emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
if (ctxt->have_exception &&
@@ -9254,7 +9173,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return 1;
}
- if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
+ /*
+ * If emulation was caused by a write-protection #PF on a non-page_table
+ * writing instruction, try to unprotect the gfn, i.e. zap shadow pages,
+ * and retry the instruction, as the vCPU is likely no longer using the
+ * gfn as a page table.
+ */
+ if ((emulation_type & EMULTYPE_ALLOW_RETRY_PF) &&
+ !x86_page_table_writing_insn(ctxt) &&
+ kvm_mmu_unprotect_gfn_and_retry(vcpu, cr2_or_gpa))
return 1;
/* this is needed for vmware backdoor interface to work since it
@@ -9285,7 +9212,8 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type))
+ if (kvm_unprotect_and_retry_on_failure(vcpu, cr2_or_gpa,
+ emulation_type))
return 1;
return handle_emulation_failure(vcpu, emulation_type);
@@ -9753,7 +9681,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
guard(mutex)(&vendor_module_lock);
- if (kvm_x86_ops.hardware_enable) {
+ if (kvm_x86_ops.enable_virtualization_cpu) {
pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
return -EEXIST;
}
@@ -9880,7 +9808,7 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops)
return 0;
out_unwind_ops:
- kvm_x86_ops.hardware_enable = NULL;
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
kvm_x86_call(hardware_unsetup)();
out_mmu_exit:
kvm_mmu_vendor_module_exit();
@@ -9921,56 +9849,11 @@ void kvm_x86_vendor_exit(void)
WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
#endif
mutex_lock(&vendor_module_lock);
- kvm_x86_ops.hardware_enable = NULL;
+ kvm_x86_ops.enable_virtualization_cpu = NULL;
mutex_unlock(&vendor_module_lock);
}
EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit);
-static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
-{
- /*
- * The vCPU has halted, e.g. executed HLT. Update the run state if the
- * local APIC is in-kernel, the run loop will detect the non-runnable
- * state and halt the vCPU. Exit to userspace if the local APIC is
- * managed by userspace, in which case userspace is responsible for
- * handling wake events.
- */
- ++vcpu->stat.halt_exits;
- if (lapic_in_kernel(vcpu)) {
- vcpu->arch.mp_state = state;
- return 1;
- } else {
- vcpu->run->exit_reason = reason;
- return 0;
- }
-}
-
-int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
-{
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
-
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
- /*
- * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
- * KVM_EXIT_DEBUG here.
- */
- return kvm_emulate_halt_noskip(vcpu) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_halt);
-
-int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
-{
- int ret = kvm_skip_emulated_instruction(vcpu);
-
- return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
- KVM_EXIT_AP_RESET_HOLD) && ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
-
#ifdef CONFIG_X86_64
static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
unsigned long clock_type)
@@ -11207,6 +11090,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
+ if (unlikely(exit_fastpath == EXIT_FASTPATH_EXIT_USERSPACE))
+ return 0;
+
r = kvm_x86_call(handle_exit)(vcpu, exit_fastpath);
return r;
@@ -11220,6 +11106,67 @@ out:
return r;
}
+static bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted);
+}
+
+static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
+{
+ if (!list_empty_careful(&vcpu->async_pf.done))
+ return true;
+
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
+ return true;
+
+ if (vcpu->arch.pv.pv_unhalted)
+ return true;
+
+ if (kvm_is_exception_pending(vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+ (vcpu->arch.nmi_pending &&
+ kvm_x86_call(nmi_allowed)(vcpu, false)))
+ return true;
+
+#ifdef CONFIG_KVM_SMM
+ if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
+ (vcpu->arch.smi_pending &&
+ kvm_x86_call(smi_allowed)(vcpu, false)))
+ return true;
+#endif
+
+ if (kvm_test_request(KVM_REQ_PMI, vcpu))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
+ return true;
+
+ if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
+ return true;
+
+ if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu, false))
+ return true;
+
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
+ return false;
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
+}
+
/* Called within kvm->srcu read side. */
static inline int vcpu_block(struct kvm_vcpu *vcpu)
{
@@ -11291,12 +11238,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
return 1;
}
-static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
-{
- return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
- !vcpu->arch.apf.halted);
-}
-
/* Called within kvm->srcu read side. */
static int vcpu_run(struct kvm_vcpu *vcpu)
{
@@ -11348,6 +11289,98 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
return r;
}
+static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
+{
+ /*
+	 * The vCPU has halted, e.g. executed HLT. If the local APIC is
+	 * in-kernel, update the run state; the run loop will detect the
+	 * non-runnable state and halt the vCPU. Exit to userspace if the
+	 * local APIC is managed by userspace, in which case userspace is
+	 * responsible for handling wake events.
+ */
+ ++vcpu->stat.halt_exits;
+ if (lapic_in_kernel(vcpu)) {
+ if (kvm_vcpu_has_events(vcpu))
+ vcpu->arch.pv.pv_unhalted = false;
+ else
+ vcpu->arch.mp_state = state;
+ return 1;
+ } else {
+ vcpu->run->exit_reason = reason;
+ return 0;
+ }
+}
+
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
+{
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_emulate_halt_noskip(vcpu) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ kvm_vcpu_srcu_read_lock(vcpu);
+ ret = kvm_emulate_halt(vcpu);
+ kvm_vcpu_srcu_read_unlock(vcpu);
+
+ if (!ret)
+ return EXIT_FASTPATH_EXIT_USERSPACE;
+
+ if (kvm_vcpu_running(vcpu))
+ return EXIT_FASTPATH_REENTER_GUEST;
+
+ return EXIT_FASTPATH_EXIT_HANDLED;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_hlt);
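/*
 * Editorial sketch: how a vendor module is expected to hook the new HLT
 * fastpath. The exit-reason constant below is a VMX-flavoured assumption;
 * the real wiring lives in the vmx/svm changes elsewhere in this series.
 * An EXIT_FASTPATH_EXIT_USERSPACE return is then caught in
 * vcpu_enter_guest() (see the hunk above) so KVM returns to userspace
 * without invoking the full exit handler.
 */
static fastpath_t vendor_exit_fastpath_sketch(struct kvm_vcpu *vcpu,
					      u32 exit_reason)
{
	switch (exit_reason) {
	case EXIT_REASON_HLT:
		return handle_fastpath_hlt(vcpu);
	default:
		return EXIT_FASTPATH_NONE;
	}
}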
+
+int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
+{
+ int ret = kvm_skip_emulated_instruction(vcpu);
+
+ return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
+ KVM_EXIT_AP_RESET_HOLD) && ret;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
+
+bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_apicv_active(vcpu) &&
+ kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
+}
+
+bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.preempted_in_kernel;
+}
+
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
+ return true;
+
+ if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
+#ifdef CONFIG_KVM_SMM
+ kvm_test_request(KVM_REQ_SMI, vcpu) ||
+#endif
+ kvm_test_request(KVM_REQ_EVENT, vcpu))
+ return true;
+
+ return kvm_arch_dy_has_pending_interrupt(vcpu);
+}
+
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
@@ -12264,8 +12297,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
@@ -12431,6 +12462,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
if (!init_event) {
vcpu->arch.smbase = 0x30000;
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
vcpu->arch.msr_misc_features_enables = 0;
vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
@@ -12516,7 +12549,17 @@ void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
-int kvm_arch_hardware_enable(void)
+void kvm_arch_enable_virtualization(void)
+{
+ cpu_emergency_register_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+void kvm_arch_disable_virtualization(void)
+{
+ cpu_emergency_unregister_virt_callback(kvm_x86_ops.emergency_disable_virtualization_cpu);
+}
+
+int kvm_arch_enable_virtualization_cpu(void)
{
struct kvm *kvm;
struct kvm_vcpu *vcpu;
@@ -12532,7 +12575,7 @@ int kvm_arch_hardware_enable(void)
if (ret)
return ret;
- ret = kvm_x86_call(hardware_enable)();
+ ret = kvm_x86_call(enable_virtualization_cpu)();
if (ret != 0)
return ret;
@@ -12612,9 +12655,9 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-void kvm_arch_hardware_disable(void)
+void kvm_arch_disable_virtualization_cpu(void)
{
- kvm_x86_call(hardware_disable)();
+ kvm_x86_call(disable_virtualization_cpu)();
drop_user_return_notifiers();
}
@@ -13162,87 +13205,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
kvm_arch_free_memslot(kvm, old);
}
-static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
-{
- if (!list_empty_careful(&vcpu->async_pf.done))
- return true;
-
- if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
- kvm_apic_init_sipi_allowed(vcpu))
- return true;
-
- if (vcpu->arch.pv.pv_unhalted)
- return true;
-
- if (kvm_is_exception_pending(vcpu))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
- (vcpu->arch.nmi_pending &&
- kvm_x86_call(nmi_allowed)(vcpu, false)))
- return true;
-
-#ifdef CONFIG_KVM_SMM
- if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
- (vcpu->arch.smi_pending &&
- kvm_x86_call(smi_allowed)(vcpu, false)))
- return true;
-#endif
-
- if (kvm_test_request(KVM_REQ_PMI, vcpu))
- return true;
-
- if (kvm_test_request(KVM_REQ_UPDATE_PROTECTED_GUEST_STATE, vcpu))
- return true;
-
- if (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu))
- return true;
-
- if (kvm_hv_has_stimer_pending(vcpu))
- return true;
-
- if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->has_events &&
- kvm_x86_ops.nested_ops->has_events(vcpu, false))
- return true;
-
- if (kvm_xen_has_pending_events(vcpu))
- return true;
-
- return false;
-}
-
-int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
-}
-
-bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_apicv_active(vcpu) &&
- kvm_x86_call(dy_apicv_has_pending_interrupt)(vcpu);
-}
-
-bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.preempted_in_kernel;
-}
-
-bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
-{
- if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
- return true;
-
- if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
-#ifdef CONFIG_KVM_SMM
- kvm_test_request(KVM_REQ_SMI, vcpu) ||
-#endif
- kvm_test_request(KVM_REQ_EVENT, vcpu))
- return true;
-
- return kvm_arch_dy_has_pending_interrupt(vcpu);
-}
-
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.guest_state_protected)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 50596f6f8320..a84c48ef5278 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -103,11 +103,18 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
return max(val, min);
}
-#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+#define MSR_IA32_CR_PAT_DEFAULT \
+ PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+/* Forcibly leave the nested mode in cases like a vCPU reset */
+static inline void kvm_leave_nested(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+}
+
static inline bool kvm_vcpu_has_run(struct kvm_vcpu *vcpu)
{
return vcpu->arch.last_vmentry_cpu != -1;
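/*
 * Editorial check (not part of the patch): assuming PAT_VALUE() packs one
 * X86_MEMTYPE_* value per byte (UC=0, WC=1, WT=4, WP=5, WB=6, UC_MINUS=7,
 * per the msr-index.h change in this series), the new definition expands
 * to exactly the literal it replaces:
 *
 *   PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC)
 *     = 0x06 | 0x04 << 8 | 0x07 << 16 | 0x00 << 24
 *     | 0x06ULL << 32 | 0x04ULL << 40 | 0x07ULL << 48 | 0x00ULL << 56
 *     = 0x0007040600070406ULL
 */
static_assert(MSR_IA32_CR_PAT_DEFAULT == 0x0007040600070406ULL);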
@@ -334,6 +341,7 @@ int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
+fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu);
extern struct kvm_caps kvm_caps;
extern struct kvm_host_values kvm_host;
@@ -504,13 +512,26 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva);
bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type);
+enum kvm_msr_access {
+ MSR_TYPE_R = BIT(0),
+ MSR_TYPE_W = BIT(1),
+ MSR_TYPE_RW = MSR_TYPE_R | MSR_TYPE_W,
+};
+
/*
* Internal error codes that are used to indicate that MSR emulation encountered
- * an error that should result in #GP in the guest, unless userspace
- * handles it.
+ * an error that should result in #GP in the guest, unless userspace handles it.
+ * Note, '1', '0', and negative numbers are off limits, as they are used by KVM
+ * as part of KVM's lightly documented internal KVM_RUN return codes.
+ *
+ * UNSUPPORTED - The MSR isn't supported, either because it is completely
+ * unknown to KVM, or because the MSR should not exist according
+ * to the vCPU model.
+ *
+ * FILTERED - Access to the MSR is denied by a userspace MSR filter.
*/
-#define KVM_MSR_RET_INVALID 2 /* in-kernel MSR emulation #GP condition */
-#define KVM_MSR_RET_FILTERED 3 /* #GP due to userspace MSR filter */
+#define KVM_MSR_RET_UNSUPPORTED 2
+#define KVM_MSR_RET_FILTERED 3
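/*
 * Editorial sketch of the expected caller-side handling; the helper name
 * is hypothetical and the real consumers live in x86.c. '1', '0' and
 * negative values flow through untouched as ordinary KVM_RUN plumbing.
 */
static int kvm_msr_ret_to_guest_sketch(struct kvm_vcpu *vcpu, int r)
{
	if (r == KVM_MSR_RET_UNSUPPORTED || r == KVM_MSR_RET_FILTERED) {
		kvm_inject_gp(vcpu, 0);	/* #GP unless userspace intercepts */
		return 1;		/* resume the guest */
	}
	return r;
}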
#define __cr4_reserved_bits(__cpu_has, __c) \
({ \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 90afb488b396..b2eff07d65e4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -16,6 +16,11 @@
cmpxchg8b (\reg)
.endm
+.macro read64_nonatomic reg
+ movl (\reg), %eax
+ movl 4(\reg), %edx
+.endm
+
SYM_FUNC_START(atomic64_read_cx8)
read64 %ecx
RET
@@ -51,7 +56,7 @@ SYM_FUNC_START(atomic64_\func\()_return_cx8)
movl %edx, %edi
movl %ecx, %ebp
- read64 %ecx
+ read64_nonatomic %ecx
1:
movl %eax, %ebx
movl %edx, %ecx
@@ -79,7 +84,7 @@ addsub_return sub sub sbb
SYM_FUNC_START(atomic64_\func\()_return_cx8)
pushl %ebx
- read64 %esi
+ read64_nonatomic %esi
1:
movl %eax, %ebx
movl %edx, %ecx
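/*
 * Editorial sketch, in C, of why the seed read does not need to be atomic:
 * it only primes a cmpxchg8b loop, which compares all 64 bits and, via
 * try_cmpxchg64(), atomically reloads 'old' on failure, so a torn read
 * merely costs one extra iteration.
 */
static s64 atomic64_add_return_sketch(s64 i, s64 *counter)
{
	s64 old = *counter;	/* may tear: two separate 32-bit loads */

	while (!try_cmpxchg64(counter, &old, old + i))
		;		/* 'old' was refreshed atomically, retry */

	return old + i;
}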
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index f73b5ce270b3..feb8cc6a12bf 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -176,15 +176,6 @@ static inline void set_page_memtype(struct page *pg,
}
#endif
-enum {
- PAT_UC = 0, /* uncached */
- PAT_WC = 1, /* Write combining */
- PAT_WT = 4, /* Write Through */
- PAT_WP = 5, /* Write Protected */
- PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
-};
-
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
@@ -194,13 +185,13 @@ static enum page_cache_mode __init pat_get_cache_mode(unsigned int pat_val,
char *cache_mode;
switch (pat_val) {
- case PAT_UC: cache = CM(UC); cache_mode = "UC "; break;
- case PAT_WC: cache = CM(WC); cache_mode = "WC "; break;
- case PAT_WT: cache = CM(WT); cache_mode = "WT "; break;
- case PAT_WP: cache = CM(WP); cache_mode = "WP "; break;
- case PAT_WB: cache = CM(WB); cache_mode = "WB "; break;
- case PAT_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
- default: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC: cache = CM(UC); cache_mode = "UC "; break;
+ case X86_MEMTYPE_WC: cache = CM(WC); cache_mode = "WC "; break;
+ case X86_MEMTYPE_WT: cache = CM(WT); cache_mode = "WT "; break;
+ case X86_MEMTYPE_WP: cache = CM(WP); cache_mode = "WP "; break;
+ case X86_MEMTYPE_WB: cache = CM(WB); cache_mode = "WB "; break;
+ case X86_MEMTYPE_UC_MINUS: cache = CM(UC_MINUS); cache_mode = "UC- "; break;
+ default: cache = CM(WB); cache_mode = "WB "; break;
}
memcpy(msg, cache_mode, 4);
@@ -257,12 +248,6 @@ void pat_cpu_init(void)
void __init pat_bp_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
-#define PAT(p0, p1, p2, p3, p4, p5, p6, p7) \
- (((u64)PAT_ ## p0) | ((u64)PAT_ ## p1 << 8) | \
- ((u64)PAT_ ## p2 << 16) | ((u64)PAT_ ## p3 << 24) | \
- ((u64)PAT_ ## p4 << 32) | ((u64)PAT_ ## p5 << 40) | \
- ((u64)PAT_ ## p6 << 48) | ((u64)PAT_ ## p7 << 56))
-
if (!IS_ENABLED(CONFIG_X86_PAT))
pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
@@ -293,7 +278,7 @@ void __init pat_bp_init(void)
* NOTE: When WC or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WT, UC_MINUS, UC, WB, WT, UC_MINUS, UC);
}
/*
@@ -328,7 +313,7 @@ void __init pat_bp_init(void)
* NOTE: When WT or WP is used, it is redirected to UC- per
* the default setup in __cachemode2pte_tbl[].
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WC, UC_MINUS, UC);
} else {
/*
* Full PAT support. We put WT in slot 7 to improve
@@ -356,13 +341,12 @@ void __init pat_bp_init(void)
* The reserved slots are unused, but mapped to their
* corresponding types in the presence of PAT errata.
*/
- pat_msr_val = PAT(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
+ pat_msr_val = PAT_VALUE(WB, WC, UC_MINUS, UC, WB, WP, UC_MINUS, WT);
}
memory_caching_control |= CACHE_PAT;
init_cache_modes(pat_msr_val);
-#undef PAT
}
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index dcd59040c449..9b1e0ede05a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1083,10 +1083,6 @@ struct amdgpu_device {
struct amdgpu_virt virt;
- /* link all shadow bo */
- struct list_head shadow_list;
- struct mutex shadow_list_lock;
-
/* record hw reset is performed */
bool has_hw_reset;
u8 reset_magic[AMDGPU_RESET_MAGIC_NUM];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 57bda66e85ef..2ca127173135 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -511,7 +511,7 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
return -EINVAL;
}
- /* udpate aca bank to aca source error_cache first */
+ /* update aca bank to aca source error_cache first */
ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 1254a43ec96b..3bc0cbf45bc5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
* @inst: xcc's instance number on a multi-XCC setup
*/
static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
- int *wave_cnt, int *vmid, uint32_t inst)
+ struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
{
int pipe_idx;
int queue_slot;
unsigned int reg_val;
-
+ unsigned int wave_cnt;
/*
* Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
* parameters to read out waves in flight. Get VMID if there are
* non-zero waves in flight.
*/
- *vmid = 0xFF;
- *wave_cnt = 0;
pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
- soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
- reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
- queue_slot);
- *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
- if (*wave_cnt != 0)
- *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
- CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
+ soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
+ reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
+ mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
+ wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
+ if (wave_cnt != 0) {
+ queue_cnt->wave_cnt += wave_cnt;
+ queue_cnt->doorbell_off =
+ (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
+ CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+ }
}
/**
@@ -981,9 +983,8 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
* or more queues running and submitting waves to compute units.
*
* @adev: Handle of device from which to get number of waves in flight
- * @pasid: Identifies the process for which this query call is invoked
- * @pasid_wave_cnt: Output parameter updated with number of waves in flight that
- * belong to process with given pasid
+ * @cu_occupancy: Array that gets filled with wave_cnt and doorbell offset
+ * for comparison later.
* @max_waves_per_cu: Output parameter updated with maximum number of waves
* possible per Compute Unit
* @inst: xcc's instance number on a multi-XCC setup
@@ -1011,34 +1012,28 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
* number of waves that are in flight for the queue at specified index. The
* index ranges from 0 to 7.
*
- * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
- * of the wave(s).
+ * If non-zero waves are in flight, store the corresponding doorbell offset
+ * of the queue, along with the wave count.
*
- * Determine if VMID from above step maps to pasid provided as parameter. If
- * it matches agrregate the wave count. That the VMID will not match pasid is
- * a normal condition i.e. a device is expected to support multiple queues
- * from multiple proceses.
+ * Determine if the queue belongs to the process by comparing the doorbell
+ * offset against the process's queues. If it matches, aggregate the wave
+ * count for the process.
*
* Reading registers referenced above involves programming GRBM appropriately
*/
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
- int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+ struct kfd_cu_occupancy *cu_occupancy,
+ int *max_waves_per_cu, uint32_t inst)
{
int qidx;
- int vmid;
int se_idx;
- int sh_idx;
int se_cnt;
- int sh_cnt;
- int wave_cnt;
int queue_map;
- int pasid_tmp;
int max_queue_cnt;
- int vmid_wave_cnt = 0;
DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
lock_spi_csq_mutexes(adev);
- soc15_grbm_select(adev, 1, 0, 0, 0, inst);
+ soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));
/*
* Iterate through the shader engines and arrays of the device
@@ -1048,51 +1043,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
AMDGPU_MAX_QUEUES);
max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
adev->gfx.mec.num_queue_per_pipe;
- sh_cnt = adev->gfx.config.max_sh_per_se;
se_cnt = adev->gfx.config.max_shader_engines;
for (se_idx = 0; se_idx < se_cnt; se_idx++) {
- for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
+ amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
+ queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS);
+
+ /*
+		 * Assumption: the queue map encodes the following schema:
+		 * four pipes per micro-engine, with each pipe mapping
+		 * eight queues. This schema holds for GFX9 devices and
+		 * must be verified for newer device families
+ */
+ for (qidx = 0; qidx < max_queue_cnt; qidx++) {
+			/* Skip queues that are not associated with
+ * compute functions
+ */
+ if (!test_bit(qidx, cp_queue_bitmap))
+ continue;
- amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
- queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+ if (!(queue_map & (1 << qidx)))
+ continue;
- /*
- * Assumption: queue map encodes following schema: four
- * pipes per each micro-engine, with each pipe mapping
- * eight queues. This schema is true for GFX9 devices
- * and must be verified for newer device families
- */
- for (qidx = 0; qidx < max_queue_cnt; qidx++) {
-
- /* Skip qeueus that are not associated with
- * compute functions
- */
- if (!test_bit(qidx, cp_queue_bitmap))
- continue;
-
- if (!(queue_map & (1 << qidx)))
- continue;
-
- /* Get number of waves in flight and aggregate them */
- get_wave_count(adev, qidx, &wave_cnt, &vmid,
- inst);
- if (wave_cnt != 0) {
- pasid_tmp =
- RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
- mmIH_VMID_0_LUT) + vmid);
- if (pasid_tmp == pasid)
- vmid_wave_cnt += wave_cnt;
- }
- }
+ /* Get number of waves in flight and aggregate them */
+ get_wave_count(adev, qidx, &cu_occupancy[qidx],
+ inst);
}
}
amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
- soc15_grbm_select(adev, 0, 0, 0, 0, inst);
+ soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
unlock_spi_csq_mutexes(adev);
/* Update the output parameters and return */
- *pasid_wave_cnt = vmid_wave_cnt;
*max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
adev->gfx.cu_info.max_waves_per_simd;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 988c50ac3be0..b6a91a552aa4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
uint8_t vmid, uint16_t *p_pasid);
void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
uint32_t vmid, uint64_t page_table_base);
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
- int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+ struct kfd_cu_occupancy *cu_occupancy,
+ int *max_waves_per_cu, uint32_t inst);
void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
uint32_t inst);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 4afef5b46c7d..ce5ca304dba9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1499,7 +1499,7 @@ static int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
}
}
- ret = amdgpu_bo_pin_restricted(bo, domain, 0, 0);
+ ret = amdgpu_bo_pin(bo, domain);
if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
index 42e64bce661e..45affc02548c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bios.c
@@ -87,8 +87,9 @@ static bool check_atom_bios(uint8_t *bios, size_t size)
* part of the system bios. On boot, the system bios puts a
* copy of the igp rom at the start of vram if a discrete card is
* present.
+ * For SR-IOV, the vbios image is also placed in the VF's VRAM.
*/
-static bool igp_read_bios_from_vram(struct amdgpu_device *adev)
+static bool amdgpu_read_bios_from_vram(struct amdgpu_device *adev)
{
uint8_t __iomem *bios;
resource_size_t vram_base;
@@ -284,10 +285,6 @@ static bool amdgpu_atrm_get_bios(struct amdgpu_device *adev)
acpi_status status;
bool found = false;
- /* ATRM is for the discrete card only */
- if (adev->flags & AMD_IS_APU)
- return false;
-
/* ATRM is for on-platform devices only */
if (dev_is_removable(&adev->pdev->dev))
return false;
@@ -343,11 +340,8 @@ static inline bool amdgpu_atrm_get_bios(struct amdgpu_device *adev)
static bool amdgpu_read_disabled_bios(struct amdgpu_device *adev)
{
- if (adev->flags & AMD_IS_APU)
- return igp_read_bios_from_vram(adev);
- else
- return (!adev->asic_funcs || !adev->asic_funcs->read_disabled_bios) ?
- false : amdgpu_asic_read_disabled_bios(adev);
+ return (!adev->asic_funcs || !adev->asic_funcs->read_disabled_bios) ?
+ false : amdgpu_asic_read_disabled_bios(adev);
}
#ifdef CONFIG_ACPI
@@ -414,7 +408,36 @@ static inline bool amdgpu_acpi_vfct_bios(struct amdgpu_device *adev)
}
#endif
-bool amdgpu_get_bios(struct amdgpu_device *adev)
+static bool amdgpu_get_bios_apu(struct amdgpu_device *adev)
+{
+ if (amdgpu_acpi_vfct_bios(adev)) {
+ dev_info(adev->dev, "Fetched VBIOS from VFCT\n");
+ goto success;
+ }
+
+ if (amdgpu_read_bios_from_vram(adev)) {
+ dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n");
+ goto success;
+ }
+
+ if (amdgpu_read_bios(adev)) {
+ dev_info(adev->dev, "Fetched VBIOS from ROM BAR\n");
+ goto success;
+ }
+
+ if (amdgpu_read_platform_bios(adev)) {
+ dev_info(adev->dev, "Fetched VBIOS from platform\n");
+ goto success;
+ }
+
+ dev_err(adev->dev, "Unable to locate a BIOS ROM\n");
+ return false;
+
+success:
+ return true;
+}
+
+static bool amdgpu_get_bios_dgpu(struct amdgpu_device *adev)
{
if (amdgpu_atrm_get_bios(adev)) {
dev_info(adev->dev, "Fetched VBIOS from ATRM\n");
@@ -426,7 +449,8 @@ bool amdgpu_get_bios(struct amdgpu_device *adev)
goto success;
}
- if (igp_read_bios_from_vram(adev)) {
+ /* this is required for SR-IOV */
+ if (amdgpu_read_bios_from_vram(adev)) {
dev_info(adev->dev, "Fetched VBIOS from VRAM BAR\n");
goto success;
}
@@ -455,10 +479,24 @@ bool amdgpu_get_bios(struct amdgpu_device *adev)
return false;
success:
- adev->is_atom_fw = adev->asic_type >= CHIP_VEGA10;
return true;
}
+bool amdgpu_get_bios(struct amdgpu_device *adev)
+{
+ bool found;
+
+ if (adev->flags & AMD_IS_APU)
+ found = amdgpu_get_bios_apu(adev);
+ else
+ found = amdgpu_get_bios_dgpu(adev);
+
+ if (found)
+ adev->is_atom_fw = adev->asic_type >= CHIP_VEGA10;
+
+ return found;
+}
+
/* helper function for soc15 and onwards to read bios from rom */
bool amdgpu_soc15_read_bios_from_rom(struct amdgpu_device *adev,
u8 *bios, u32 length_bytes)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f4628412dac4..c2394c8b4d6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4107,9 +4107,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->mm_stats.lock);
spin_lock_init(&adev->wb.lock);
- INIT_LIST_HEAD(&adev->shadow_list);
- mutex_init(&adev->shadow_list_lock);
-
INIT_LIST_HEAD(&adev->reset_list);
INIT_LIST_HEAD(&adev->ras_list);
@@ -5030,80 +5027,6 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
}
/**
- * amdgpu_device_recover_vram - Recover some VRAM contents
- *
- * @adev: amdgpu_device pointer
- *
- * Restores the contents of VRAM buffers from the shadows in GTT. Used to
- * restore things like GPUVM page tables after a GPU reset where
- * the contents of VRAM might be lost.
- *
- * Returns:
- * 0 on success, negative error code on failure.
- */
-static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
-{
- struct dma_fence *fence = NULL, *next = NULL;
- struct amdgpu_bo *shadow;
- struct amdgpu_bo_vm *vmbo;
- long r = 1, tmo;
-
- if (amdgpu_sriov_runtime(adev))
- tmo = msecs_to_jiffies(8000);
- else
- tmo = msecs_to_jiffies(100);
-
- dev_info(adev->dev, "recover vram bo from shadow start\n");
- mutex_lock(&adev->shadow_list_lock);
- list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
- /* If vm is compute context or adev is APU, shadow will be NULL */
- if (!vmbo->shadow)
- continue;
- shadow = vmbo->shadow;
-
- /* No need to recover an evicted BO */
- if (!shadow->tbo.resource ||
- shadow->tbo.resource->mem_type != TTM_PL_TT ||
- shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
- shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
- continue;
-
- r = amdgpu_bo_restore_shadow(shadow, &next);
- if (r)
- break;
-
- if (fence) {
- tmo = dma_fence_wait_timeout(fence, false, tmo);
- dma_fence_put(fence);
- fence = next;
- if (tmo == 0) {
- r = -ETIMEDOUT;
- break;
- } else if (tmo < 0) {
- r = tmo;
- break;
- }
- } else {
- fence = next;
- }
- }
- mutex_unlock(&adev->shadow_list_lock);
-
- if (fence)
- tmo = dma_fence_wait_timeout(fence, false, tmo);
- dma_fence_put(fence);
-
- if (r < 0 || tmo <= 0) {
- dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
- return -EIO;
- }
-
- dev_info(adev->dev, "recover vram bo from shadow done\n");
- return 0;
-}
-
-
-/**
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
*
* @adev: amdgpu_device pointer
@@ -5165,12 +5088,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
if (r)
return r;
- if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+ if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
amdgpu_inc_vram_lost(adev);
- r = amdgpu_device_recover_vram(adev);
- }
- if (r)
- return r;
/* need to be called during full access so we can't do it later like
* bare-metal does.
@@ -5569,9 +5488,7 @@ out:
}
}
- if (!r)
- r = amdgpu_device_recover_vram(tmp_adev);
- else
+ if (r)
tmp_adev->asic_reset_res = r;
}
@@ -6189,7 +6106,7 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
p2p_addressable = !(adev->gmc.aper_base & address_mask ||
aper_limit & address_mask);
}
- return is_large_bar && p2p_access && p2p_addressable;
+ return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
#else
return false;
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 092ec11258cd..b119d27271c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -233,6 +233,7 @@ int amdgpu_display_crtc_page_flip_target(struct drm_crtc *crtc,
}
if (!adev->enable_virtual_display) {
+ new_abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(new_abo,
amdgpu_display_supported_domains(adev, new_abo->flags));
if (unlikely(r != 0)) {
@@ -1474,7 +1475,7 @@ bool amdgpu_display_crtc_scaling_mode_fixup(struct drm_crtc *crtc,
if ((!(mode->flags & DRM_MODE_FLAG_INTERLACE)) &&
((amdgpu_encoder->underscan_type == UNDERSCAN_ON) ||
((amdgpu_encoder->underscan_type == UNDERSCAN_AUTO) &&
- connector->display_info.is_hdmi &&
+ connector && connector->display_info.is_hdmi &&
amdgpu_display_is_hdtv_mode(mode)))) {
if (amdgpu_encoder->underscan_hborder != 0)
amdgpu_crtc->h_border = amdgpu_encoder->underscan_hborder;
@@ -1759,6 +1760,7 @@ int amdgpu_display_resume_helper(struct amdgpu_device *adev)
r = amdgpu_bo_reserve(aobj, true);
if (r == 0) {
+ aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
if (r != 0)
dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f57411ed2dc2..81d9877c8735 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -117,9 +117,10 @@
* - 3.56.0 - Update IB start address and size alignment for decode and encode
* - 3.57.0 - Compute tunneling on GFX10+
* - 3.58.0 - Add GFX12 DCC support
+ * - 3.59.0 - Cleared VRAM
*/
#define KMS_DRIVER_MAJOR 3
-#define KMS_DRIVER_MINOR 58
+#define KMS_DRIVER_MINOR 59
#define KMS_DRIVER_PATCHLEVEL 0
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 0e617dff8765..1a5df8b94661 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -43,8 +43,6 @@
#include "amdgpu_hmm.h"
#include "amdgpu_xgmi.h"
-static const struct drm_gem_object_funcs amdgpu_gem_object_funcs;
-
static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
{
struct ttm_buffer_object *bo = vmf->vma->vm_private_data;
@@ -87,11 +85,11 @@ static const struct vm_operations_struct amdgpu_gem_vm_ops = {
static void amdgpu_gem_object_free(struct drm_gem_object *gobj)
{
- struct amdgpu_bo *robj = gem_to_amdgpu_bo(gobj);
+ struct amdgpu_bo *aobj = gem_to_amdgpu_bo(gobj);
- if (robj) {
- amdgpu_hmm_unregister(robj);
- amdgpu_bo_unref(&robj);
+ if (aobj) {
+ amdgpu_hmm_unregister(aobj);
+ ttm_bo_put(&aobj->tbo);
}
}
@@ -126,7 +124,6 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size,
bo = &ubo->bo;
*obj = &bo->tbo.base;
- (*obj)->funcs = &amdgpu_gem_object_funcs;
return 0;
}
@@ -295,7 +292,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str
return drm_gem_ttm_mmap(obj, vma);
}
-static const struct drm_gem_object_funcs amdgpu_gem_object_funcs = {
+const struct drm_gem_object_funcs amdgpu_gem_object_funcs = {
.free = amdgpu_gem_object_free,
.open = amdgpu_gem_object_open,
.close = amdgpu_gem_object_close,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
index f30264782ba2..3a8f57900a3a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h
@@ -33,6 +33,8 @@
#define AMDGPU_GEM_DOMAIN_MAX 0x3
#define gem_to_amdgpu_bo(gobj) container_of((gobj), struct amdgpu_bo, tbo.base)
+extern const struct drm_gem_object_funcs amdgpu_gem_object_funcs;
+
unsigned long amdgpu_gem_timeout(uint64_t timeout_ns);
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ad6bf5d4e0a9..16f2605ac50b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -107,8 +107,11 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
/*
* Do the coredump immediately after a job timeout to get a very
* close dump/snapshot/representation of GPU's current error status
+	 * Skip it for SR-IOV, since VF FLR will be triggered by the host
+	 * driver before the job timeout.
*/
- amdgpu_job_core_dump(adev, job);
+ if (!amdgpu_sriov_vf(adev))
+ amdgpu_job_core_dump(adev, job);
if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index e32161f6b67a..44819cdba7fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -77,24 +77,6 @@ static void amdgpu_bo_user_destroy(struct ttm_buffer_object *tbo)
amdgpu_bo_destroy(tbo);
}
-static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
-{
- struct amdgpu_device *adev = amdgpu_ttm_adev(tbo->bdev);
- struct amdgpu_bo *shadow_bo = ttm_to_amdgpu_bo(tbo), *bo;
- struct amdgpu_bo_vm *vmbo;
-
- bo = shadow_bo->parent;
- vmbo = to_amdgpu_bo_vm(bo);
- /* in case amdgpu_device_recover_vram got NULL of bo->parent */
- if (!list_empty(&vmbo->shadow_list)) {
- mutex_lock(&adev->shadow_list_lock);
- list_del_init(&vmbo->shadow_list);
- mutex_unlock(&adev->shadow_list_lock);
- }
-
- amdgpu_bo_destroy(tbo);
-}
-
/**
* amdgpu_bo_is_amdgpu_bo - check if the buffer object is an &amdgpu_bo
* @bo: buffer object to be checked
@@ -108,8 +90,7 @@ static void amdgpu_bo_vm_destroy(struct ttm_buffer_object *tbo)
bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo)
{
if (bo->destroy == &amdgpu_bo_destroy ||
- bo->destroy == &amdgpu_bo_user_destroy ||
- bo->destroy == &amdgpu_bo_vm_destroy)
+ bo->destroy == &amdgpu_bo_user_destroy)
return true;
return false;
@@ -583,6 +564,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
if (bo == NULL)
return -ENOMEM;
drm_gem_private_object_init(adev_to_drm(adev), &bo->tbo.base, size);
+ bo->tbo.base.funcs = &amdgpu_gem_object_funcs;
bo->vm_bo = NULL;
bo->preferred_domains = bp->preferred_domain ? bp->preferred_domain :
bp->domain;
@@ -723,52 +705,6 @@ int amdgpu_bo_create_vm(struct amdgpu_device *adev,
}
/**
- * amdgpu_bo_add_to_shadow_list - add a BO to the shadow list
- *
- * @vmbo: BO that will be inserted into the shadow list
- *
- * Insert a BO to the shadow list.
- */
-void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo)
-{
- struct amdgpu_device *adev = amdgpu_ttm_adev(vmbo->bo.tbo.bdev);
-
- mutex_lock(&adev->shadow_list_lock);
- list_add_tail(&vmbo->shadow_list, &adev->shadow_list);
- vmbo->shadow->parent = amdgpu_bo_ref(&vmbo->bo);
- vmbo->shadow->tbo.destroy = &amdgpu_bo_vm_destroy;
- mutex_unlock(&adev->shadow_list_lock);
-}
-
-/**
- * amdgpu_bo_restore_shadow - restore an &amdgpu_bo shadow
- *
- * @shadow: &amdgpu_bo shadow to be restored
- * @fence: dma_fence associated with the operation
- *
- * Copies a buffer object's shadow content back to the object.
- * This is used for recovering a buffer from its shadow in case of a gpu
- * reset where vram context may be lost.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)
-
-{
- struct amdgpu_device *adev = amdgpu_ttm_adev(shadow->tbo.bdev);
- struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
- uint64_t shadow_addr, parent_addr;
-
- shadow_addr = amdgpu_bo_gpu_offset(shadow);
- parent_addr = amdgpu_bo_gpu_offset(shadow->parent);
-
- return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,
- amdgpu_bo_size(shadow), NULL, fence,
- true, false, 0);
-}
-
-/**
* amdgpu_bo_kmap - map an &amdgpu_bo buffer object
* @bo: &amdgpu_bo buffer object to be mapped
* @ptr: kernel virtual address to be returned
@@ -851,7 +787,7 @@ struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo)
if (bo == NULL)
return NULL;
- ttm_bo_get(&bo->tbo);
+ drm_gem_object_get(&bo->tbo.base);
return bo;
}
@@ -863,40 +799,30 @@ struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo)
*/
void amdgpu_bo_unref(struct amdgpu_bo **bo)
{
- struct ttm_buffer_object *tbo;
-
if ((*bo) == NULL)
return;
- tbo = &((*bo)->tbo);
- ttm_bo_put(tbo);
+ drm_gem_object_put(&(*bo)->tbo.base);
*bo = NULL;
}
/**
- * amdgpu_bo_pin_restricted - pin an &amdgpu_bo buffer object
+ * amdgpu_bo_pin - pin an &amdgpu_bo buffer object
* @bo: &amdgpu_bo buffer object to be pinned
* @domain: domain to be pinned to
- * @min_offset: the start of requested address range
- * @max_offset: the end of requested address range
*
- * Pins the buffer object according to requested domain and address range. If
- * the memory is unbound gart memory, binds the pages into gart table. Adjusts
- * pin_count and pin_size accordingly.
+ * Pins the buffer object according to requested domain. If the memory is
+ * unbound gart memory, binds the pages into gart table. Adjusts pin_count and
+ * pin_size accordingly.
*
* Pinning means to lock pages in memory along with keeping them at a fixed
* offset. It is required when a buffer can not be moved, for example, when
* a display buffer is being scanned out.
*
- * Compared with amdgpu_bo_pin(), this function gives more flexibility on
- * where to pin a buffer if there are specific restrictions on where a buffer
- * must be located.
- *
* Returns:
* 0 for success or a negative error code on failure.
*/
-int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
- u64 min_offset, u64 max_offset)
+int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain)
{
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
struct ttm_operation_ctx ctx = { false, false };
@@ -905,9 +831,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm))
return -EPERM;
- if (WARN_ON_ONCE(min_offset > max_offset))
- return -EINVAL;
-
/* Check domain to be pinned to against preferred domains */
if (bo->preferred_domains & domain)
domain = bo->preferred_domains & domain;
@@ -933,14 +856,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
return -EINVAL;
ttm_bo_pin(&bo->tbo);
-
- if (max_offset != 0) {
- u64 domain_start = amdgpu_ttm_domain_start(adev,
- mem_type);
- WARN_ON_ONCE(max_offset <
- (amdgpu_bo_gpu_offset(bo) - domain_start));
- }
-
return 0;
}
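/*
 * Editorial sketch: with the restricted variant gone, forcing contiguous
 * VRAM is now opt-in at the call sites (see the amdgpu_display.c and
 * amdgpu_vkms.c hunks above), e.g. for a scanout buffer:
 *
 *	abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
 *	r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
 */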
@@ -957,17 +872,6 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
amdgpu_bo_placement_from_domain(bo, domain);
for (i = 0; i < bo->placement.num_placement; i++) {
- unsigned int fpfn, lpfn;
-
- fpfn = min_offset >> PAGE_SHIFT;
- lpfn = max_offset >> PAGE_SHIFT;
-
- if (fpfn > bo->placements[i].fpfn)
- bo->placements[i].fpfn = fpfn;
- if (!bo->placements[i].lpfn ||
- (lpfn && lpfn < bo->placements[i].lpfn))
- bo->placements[i].lpfn = lpfn;
-
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS &&
bo->placements[i].mem_type == TTM_PL_VRAM)
bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
@@ -994,24 +898,6 @@ error:
}
/**
- * amdgpu_bo_pin - pin an &amdgpu_bo buffer object
- * @bo: &amdgpu_bo buffer object to be pinned
- * @domain: domain to be pinned to
- *
- * A simple wrapper to amdgpu_bo_pin_restricted().
- * Provides a simpler API for buffers that do not have any strict restrictions
- * on where a buffer must be located.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain)
-{
- bo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
- return amdgpu_bo_pin_restricted(bo, domain, 0, 0);
-}
-
-/**
* amdgpu_bo_unpin - unpin an &amdgpu_bo buffer object
* @bo: &amdgpu_bo buffer object to be unpinned
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index d7e27957013f..717e47b46167 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -136,8 +136,6 @@ struct amdgpu_bo_user {
struct amdgpu_bo_vm {
struct amdgpu_bo bo;
- struct amdgpu_bo *shadow;
- struct list_head shadow_list;
struct amdgpu_vm_bo_base entries[];
};
@@ -275,22 +273,6 @@ static inline bool amdgpu_bo_encrypted(struct amdgpu_bo *bo)
return bo->flags & AMDGPU_GEM_CREATE_ENCRYPTED;
}
-/**
- * amdgpu_bo_shadowed - check if the BO is shadowed
- *
- * @bo: BO to be tested.
- *
- * Returns:
- * NULL if not shadowed or else return a BO pointer.
- */
-static inline struct amdgpu_bo *amdgpu_bo_shadowed(struct amdgpu_bo *bo)
-{
- if (bo->tbo.type == ttm_bo_type_kernel)
- return to_amdgpu_bo_vm(bo)->shadow;
-
- return NULL;
-}
-
bool amdgpu_bo_is_amdgpu_bo(struct ttm_buffer_object *bo);
void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain);
@@ -322,8 +304,6 @@ void amdgpu_bo_kunmap(struct amdgpu_bo *bo);
struct amdgpu_bo *amdgpu_bo_ref(struct amdgpu_bo *bo);
void amdgpu_bo_unref(struct amdgpu_bo **bo);
int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain);
-int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
- u64 min_offset, u64 max_offset);
void amdgpu_bo_unpin(struct amdgpu_bo *bo);
int amdgpu_bo_init(struct amdgpu_device *adev);
void amdgpu_bo_fini(struct amdgpu_device *adev);
@@ -349,9 +329,6 @@ u64 amdgpu_bo_gpu_offset(struct amdgpu_bo *bo);
u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo);
void amdgpu_bo_get_memory(struct amdgpu_bo *bo,
struct amdgpu_mem_stats *stats);
-void amdgpu_bo_add_to_shadow_list(struct amdgpu_bo_vm *vmbo);
-int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow,
- struct dma_fence **fence);
uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,
uint32_t domain);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 189574d53ebd..0b28b2cf1517 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2853,7 +2853,7 @@ static int psp_load_non_psp_fw(struct psp_context *psp)
if (ret)
return ret;
- /* Start rlc autoload after psp recieved all the gfx firmware */
+ /* Start rlc autoload after psp received all the gfx firmware */
if (psp->autoload_supported && ucode->ucode_id == (amdgpu_sriov_vf(adev) ?
adev->virt.autoload_ucode_id : AMDGPU_UCODE_ID_RLC_G)) {
ret = psp_rlc_autoload_start(psp);
@@ -3425,9 +3425,11 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name)
const struct psp_firmware_header_v1_2 *sos_hdr_v1_2;
const struct psp_firmware_header_v1_3 *sos_hdr_v1_3;
const struct psp_firmware_header_v2_0 *sos_hdr_v2_0;
- int err = 0;
+ const struct psp_firmware_header_v2_1 *sos_hdr_v2_1;
+ int fw_index, fw_bin_count, start_index = 0;
+ const struct psp_fw_bin_desc *fw_bin;
uint8_t *ucode_array_start_addr;
- int fw_index = 0;
+ int err = 0;
err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, "amdgpu/%s_sos.bin", chip_name);
if (err)
@@ -3478,15 +3480,30 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name)
case 2:
sos_hdr_v2_0 = (const struct psp_firmware_header_v2_0 *)adev->psp.sos_fw->data;
- if (le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count) >= UCODE_MAX_PSP_PACKAGING) {
+ fw_bin_count = le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count);
+
+ if (fw_bin_count >= UCODE_MAX_PSP_PACKAGING) {
dev_err(adev->dev, "packed SOS count exceeds maximum limit\n");
err = -EINVAL;
goto out;
}
- for (fw_index = 0; fw_index < le32_to_cpu(sos_hdr_v2_0->psp_fw_bin_count); fw_index++) {
- err = parse_sos_bin_descriptor(psp,
- &sos_hdr_v2_0->psp_fw_bin[fw_index],
+ if (sos_hdr_v2_0->header.header_version_minor == 1) {
+ sos_hdr_v2_1 = (const struct psp_firmware_header_v2_1 *)adev->psp.sos_fw->data;
+
+ fw_bin = sos_hdr_v2_1->psp_fw_bin;
+
+ if (psp_is_aux_sos_load_required(psp))
+ start_index = le32_to_cpu(sos_hdr_v2_1->psp_aux_fw_bin_index);
+ else
+ fw_bin_count -= le32_to_cpu(sos_hdr_v2_1->psp_aux_fw_bin_index);
+
+ } else {
+ fw_bin = sos_hdr_v2_0->psp_fw_bin;
+ }
+
+ for (fw_index = start_index; fw_index < fw_bin_count; fw_index++) {
+ err = parse_sos_bin_descriptor(psp, fw_bin + fw_index,
sos_hdr_v2_0);
if (err)
goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 74a96516c913..e8abbbcb4326 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -138,6 +138,7 @@ struct psp_funcs {
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
bool (*get_ras_capability)(struct psp_context *psp);
+ bool (*is_aux_sos_load_required)(struct psp_context *psp);
};
struct ta_funcs {
@@ -464,6 +465,9 @@ struct amdgpu_psp_funcs {
((psp)->funcs->fatal_error_recovery_quirk ? \
(psp)->funcs->fatal_error_recovery_quirk((psp)) : 0)
+#define psp_is_aux_sos_load_required(psp) \
+ ((psp)->funcs->is_aux_sos_load_required ? (psp)->funcs->is_aux_sos_load_required((psp)) : 0)
+
extern const struct amd_ip_funcs psp_ip_funcs;
extern const struct amdgpu_ip_block_version psp_v3_1_ip_block;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 61a2f386d9fb..1a1395c5fff1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -882,7 +882,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
if (ret)
return ret;
- /* gfx block ras dsiable cmd must send to ras-ta */
+	/* gfx block ras disable cmd must be sent to ras-ta */
if (head->block == AMDGPU_RAS_BLOCK__GFX)
con->features |= BIT(head->block);
@@ -3468,6 +3468,11 @@ init_ras_enabled_flag:
/* aca is disabled by default */
adev->aca.is_enabled = false;
+
+	/* the bad page feature is not applicable on this app APU platform */
+ if (adev->gmc.is_app_apu &&
+ amdgpu_ip_version(adev, UMC_HWIP, 0) == IP_VERSION(12, 0, 0))
+ amdgpu_bad_page_threshold = 0;
}
static void amdgpu_ras_counte_dw(struct work_struct *work)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index aab8077e5098..f28f6b4ba765 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -58,7 +58,7 @@
#define EEPROM_I2C_MADDR_4 0x40000
/*
- * The 2 macros bellow represent the actual size in bytes that
+ * The 2 macros below represent the actual size in bytes that
* those entities occupy in the EEPROM memory.
* RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_table_record) which
* uses uint64 to store 6b fields such as retired_page.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
index bdf1ef825d89..c586ab4c911b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
@@ -260,6 +260,36 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
return 0;
}
+/**
+ * amdgpu_sync_kfd - sync to KFD fences
+ *
+ * @sync: sync object to add KFD fences to
+ * @resv: reservation object with KFD fences
+ *
+ * Extract all KFD fences and add them to the sync object.
+ */
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv)
+{
+ struct dma_resv_iter cursor;
+ struct dma_fence *f;
+ int r = 0;
+
+ dma_resv_iter_begin(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP);
+ dma_resv_for_each_fence_unlocked(&cursor, f) {
+ void *fence_owner = amdgpu_sync_get_owner(f);
+
+ if (fence_owner != AMDGPU_FENCE_OWNER_KFD)
+ continue;
+
+ r = amdgpu_sync_fence(sync, f);
+ if (r)
+ break;
+ }
+ dma_resv_iter_end(&cursor);
+
+ return r;
+}
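/*
 * Editorial note: the amdgpu_vm.c hunk later in this patch shows the
 * intended use - page-table updates against a KFD BO now wait on the
 * BO's KFD fences first:
 *
 *	r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv);
 *	if (r)
 *		goto error_free;
 */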
+
/* Free the entry back to the slab */
static void amdgpu_sync_entry_free(struct amdgpu_sync_entry *e)
{
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
index cf1e9e858efd..e3272dce798d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.h
@@ -51,6 +51,7 @@ int amdgpu_sync_fence(struct amdgpu_sync *sync, struct dma_fence *f);
int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
struct dma_resv *resv, enum amdgpu_sync_mode mode,
void *owner);
+int amdgpu_sync_kfd(struct amdgpu_sync *sync, struct dma_resv *resv);
struct dma_fence *amdgpu_sync_peek_fence(struct amdgpu_sync *sync,
struct amdgpu_ring *ring);
struct dma_fence *amdgpu_sync_get_fence(struct amdgpu_sync *sync);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index b8bc7fa8c375..74adb983ab03 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1970,7 +1970,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
DRM_INFO("amdgpu: %uM of GTT memory ready.\n",
(unsigned int)(gtt_size / (1024 * 1024)));
- /* Initiailize doorbell pool on PCI BAR */
+ /* Initialize doorbell pool on PCI BAR */
r = amdgpu_ttm_init_on_chip(adev, AMDGPU_PL_DOORBELL, adev->doorbell.size / PAGE_SIZE);
if (r) {
DRM_ERROR("Failed initializing doorbell heap.\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
index 5bc37acd3981..4e23419b92d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
@@ -136,6 +136,14 @@ struct psp_firmware_header_v2_0 {
struct psp_fw_bin_desc psp_fw_bin[];
};
+/* version_major=2, version_minor=1 */
+struct psp_firmware_header_v2_1 {
+ struct common_firmware_header header;
+ uint32_t psp_fw_bin_count;
+ uint32_t psp_aux_fw_bin_index;
+ struct psp_fw_bin_desc psp_fw_bin[];
+};
+
/* version_major=1, version_minor=0 */
struct ta_firmware_header_v1_0 {
struct common_firmware_header header;
@@ -426,6 +434,7 @@ union amdgpu_firmware_header {
struct psp_firmware_header_v1_1 psp_v1_1;
struct psp_firmware_header_v1_3 psp_v1_3;
struct psp_firmware_header_v2_0 psp_v2_0;
+ struct psp_firmware_header_v2_0 psp_v2_1;
struct ta_firmware_header_v1_0 ta;
struct ta_firmware_header_v2_0 ta_v2_0;
struct gfx_firmware_header_v1_0 gfx;
@@ -447,7 +456,7 @@ union amdgpu_firmware_header {
uint8_t raw[0x100];
};
-#define UCODE_MAX_PSP_PACKAGING ((sizeof(union amdgpu_firmware_header) - sizeof(struct common_firmware_header) - 4) / sizeof(struct psp_fw_bin_desc))
+#define UCODE_MAX_PSP_PACKAGING (((sizeof(union amdgpu_firmware_header) - sizeof(struct common_firmware_header) - 4) / sizeof(struct psp_fw_bin_desc)) * 2)
/*
* fw loading support
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index e5f508d34ed8..d4c2afafbb73 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -338,6 +338,7 @@ static int amdgpu_vkms_prepare_fb(struct drm_plane *plane,
else
domain = AMDGPU_GEM_DOMAIN_VRAM;
+ rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(rbo, domain);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 2452dfa6314f..6005280f5f38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -465,7 +465,6 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
{
uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
struct amdgpu_vm_bo_base *bo_base;
- struct amdgpu_bo *shadow;
struct amdgpu_bo *bo;
int r;
@@ -486,16 +485,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
spin_unlock(&vm->status_lock);
bo = bo_base->bo;
- shadow = amdgpu_bo_shadowed(bo);
r = validate(param, bo);
if (r)
return r;
- if (shadow) {
- r = validate(param, shadow);
- if (r)
- return r;
- }
if (bo->tbo.type != ttm_bo_type_kernel) {
amdgpu_vm_bo_moved(bo_base);
@@ -1176,6 +1169,12 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
AMDGPU_SYNC_EQ_OWNER, vm);
if (r)
goto error_free;
+ if (bo) {
+ r = amdgpu_sync_kfd(&sync, bo->tbo.base.resv);
+ if (r)
+ goto error_free;
+ }
+
} else {
struct drm_gem_object *obj = &bo->tbo.base;
@@ -2149,10 +2148,6 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
{
struct amdgpu_vm_bo_base *bo_base;
- /* shadow bo doesn't have bo base, its validation needs its parent */
- if (bo->parent && (amdgpu_bo_shadowed(bo->parent) == bo))
- bo = bo->parent;
-
for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
struct amdgpu_vm *vm = bo_base->vm;
@@ -2482,7 +2477,6 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
root_bo = amdgpu_bo_ref(&root->bo);
r = amdgpu_bo_reserve(root_bo, true);
if (r) {
- amdgpu_bo_unref(&root->shadow);
amdgpu_bo_unref(&root_bo);
goto error_free_delayed;
}
@@ -2575,11 +2569,6 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
vm->last_update = dma_fence_get_stub();
vm->is_compute_context = true;
- /* Free the shadow bo for compute VM */
- amdgpu_bo_unref(&to_amdgpu_bo_vm(vm->root.bo)->shadow);
-
- goto unreserve_bo;
-
unreserve_bo:
amdgpu_bo_unreserve(vm->root.bo);
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a076f43097e4..f78a0434a48f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -383,14 +383,6 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (r)
return r;
- if (vmbo->shadow) {
- struct amdgpu_bo *shadow = vmbo->shadow;
-
- r = ttm_bo_validate(&shadow->tbo, &shadow->placement, &ctx);
- if (r)
- return r;
- }
-
if (!drm_dev_enter(adev_to_drm(adev), &idx))
return -ENODEV;
@@ -448,10 +440,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
int32_t xcp_id)
{
struct amdgpu_bo_param bp;
- struct amdgpu_bo *bo;
- struct dma_resv *resv;
unsigned int num_entries;
- int r;
memset(&bp, 0, sizeof(bp));
@@ -484,42 +473,7 @@ int amdgpu_vm_pt_create(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (vm->root.bo)
bp.resv = vm->root.bo->tbo.base.resv;
- r = amdgpu_bo_create_vm(adev, &bp, vmbo);
- if (r)
- return r;
-
- bo = &(*vmbo)->bo;
- if (vm->is_compute_context || (adev->flags & AMD_IS_APU)) {
- (*vmbo)->shadow = NULL;
- return 0;
- }
-
- if (!bp.resv)
- WARN_ON(dma_resv_lock(bo->tbo.base.resv,
- NULL));
- resv = bp.resv;
- memset(&bp, 0, sizeof(bp));
- bp.size = amdgpu_vm_pt_size(adev, level);
- bp.domain = AMDGPU_GEM_DOMAIN_GTT;
- bp.flags = AMDGPU_GEM_CREATE_CPU_GTT_USWC;
- bp.type = ttm_bo_type_kernel;
- bp.resv = bo->tbo.base.resv;
- bp.bo_ptr_size = sizeof(struct amdgpu_bo);
- bp.xcp_id_plus1 = xcp_id + 1;
-
- r = amdgpu_bo_create(adev, &bp, &(*vmbo)->shadow);
-
- if (!resv)
- dma_resv_unlock(bo->tbo.base.resv);
-
- if (r) {
- amdgpu_bo_unref(&bo);
- return r;
- }
-
- amdgpu_bo_add_to_shadow_list(*vmbo);
-
- return 0;
+ return amdgpu_bo_create_vm(adev, &bp, vmbo);
}
/**
@@ -569,7 +523,6 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
return 0;
error_free_pt:
- amdgpu_bo_unref(&pt->shadow);
amdgpu_bo_unref(&pt_bo);
return r;
}
@@ -581,17 +534,10 @@ error_free_pt:
*/
static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry)
{
- struct amdgpu_bo *shadow;
-
if (!entry->bo)
return;
entry->bo->vm_bo = NULL;
- shadow = amdgpu_bo_shadowed(entry->bo);
- if (shadow) {
- ttm_bo_set_bulk_move(&shadow->tbo, NULL);
- amdgpu_bo_unref(&shadow);
- }
ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
spin_lock(&entry->vm->status_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 4772fba33285..46d9fb433ab2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -35,16 +35,7 @@
*/
static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table)
{
- int r;
-
- r = amdgpu_ttm_alloc_gart(&table->bo.tbo);
- if (r)
- return r;
-
- if (table->shadow)
- r = amdgpu_ttm_alloc_gart(&table->shadow->tbo);
-
- return r;
+ return amdgpu_ttm_alloc_gart(&table->bo.tbo);
}
/* Allocate a new job for @count PTE updates */
@@ -265,17 +256,13 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
if (!p->pages_addr) {
/* set page commands needed */
- if (vmbo->shadow)
- amdgpu_vm_sdma_set_ptes(p, vmbo->shadow, pe, addr,
- count, incr, flags);
amdgpu_vm_sdma_set_ptes(p, bo, pe, addr, count,
incr, flags);
return 0;
}
/* copy commands needed */
- ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw *
- (vmbo->shadow ? 2 : 1);
+ ndw -= p->adev->vm_manager.vm_pte_funcs->copy_pte_num_dw;
/* for padding */
ndw -= 7;
@@ -290,8 +277,6 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p,
pte[i] |= flags;
}
- if (vmbo->shadow)
- amdgpu_vm_sdma_copy_ptes(p, vmbo->shadow, pe, nptes);
amdgpu_vm_sdma_copy_ptes(p, bo, pe, nptes);
pe += nptes * 8;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 90138bc5f03d..32775260556f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -180,6 +180,6 @@ amdgpu_get_next_xcp(struct amdgpu_xcp_mgr *xcp_mgr, int *from)
#define for_each_xcp(xcp_mgr, xcp, i) \
for (i = 0, xcp = amdgpu_get_next_xcp(xcp_mgr, &i); xcp; \
- xcp = amdgpu_get_next_xcp(xcp_mgr, &i))
+ ++i, xcp = amdgpu_get_next_xcp(xcp_mgr, &i))
#endif
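The added ++i matters because amdgpu_get_next_xcp() returns the partition at *from without stepping past it; without the increment, the continuation expression re-fetched the same XCP on every iteration and the loop never terminated. A usage sketch (adev assumed in scope):

struct amdgpu_xcp *xcp;
int i;

/* i is advanced past the partition just visited before
 * amdgpu_get_next_xcp() searches for the next valid one
 */
for_each_xcp(adev->xcp_mgr, xcp, i)
	dev_dbg(adev->dev, "partition %d is valid\n", i);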
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 26e2188101e7..5e8833e4fed2 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -94,8 +94,6 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,
case AMDGPU_RING_TYPE_VCN_ENC:
case AMDGPU_RING_TYPE_VCN_JPEG:
ip_blk = AMDGPU_XCP_VCN;
- if (aqua_vanjaram_xcp_vcn_shared(adev))
- inst_mask = 1 << (inst_idx * 2);
break;
default:
DRM_ERROR("Not support ring type %d!", ring->funcs->type);
@@ -105,6 +103,8 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,
for (xcp_id = 0; xcp_id < adev->xcp_mgr->num_xcps; xcp_id++) {
if (adev->xcp_mgr->xcp[xcp_id].ip[ip_blk].inst_mask & inst_mask) {
ring->xcp_id = xcp_id;
+ dev_dbg(adev->dev, "ring:%s xcp_id:%u\n", ring->name,
+ ring->xcp_id);
if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
adev->gfx.enforce_isolation[xcp_id].xcp_id = xcp_id;
break;
@@ -394,38 +394,31 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct amdgpu_xcp_mgr *xcp_mgr, int x
struct amdgpu_xcp_ip *ip)
{
struct amdgpu_device *adev = xcp_mgr->adev;
+ int num_sdma, num_vcn, num_shared_vcn, num_xcp;
int num_xcc_xcp, num_sdma_xcp, num_vcn_xcp;
- int num_sdma, num_vcn;
num_sdma = adev->sdma.num_instances;
num_vcn = adev->vcn.num_vcn_inst;
+ num_shared_vcn = 1;
+
+ num_xcc_xcp = adev->gfx.num_xcc_per_xcp;
+ num_xcp = NUM_XCC(adev->gfx.xcc_mask) / num_xcc_xcp;
switch (xcp_mgr->mode) {
case AMDGPU_SPX_PARTITION_MODE:
- num_sdma_xcp = num_sdma;
- num_vcn_xcp = num_vcn;
- break;
case AMDGPU_DPX_PARTITION_MODE:
- num_sdma_xcp = num_sdma / 2;
- num_vcn_xcp = num_vcn / 2;
- break;
case AMDGPU_TPX_PARTITION_MODE:
- num_sdma_xcp = num_sdma / 3;
- num_vcn_xcp = num_vcn / 3;
- break;
case AMDGPU_QPX_PARTITION_MODE:
- num_sdma_xcp = num_sdma / 4;
- num_vcn_xcp = num_vcn / 4;
- break;
case AMDGPU_CPX_PARTITION_MODE:
- num_sdma_xcp = 2;
- num_vcn_xcp = num_vcn ? 1 : 0;
+ num_sdma_xcp = DIV_ROUND_UP(num_sdma, num_xcp);
+ num_vcn_xcp = DIV_ROUND_UP(num_vcn, num_xcp);
break;
default:
return -EINVAL;
}
- num_xcc_xcp = adev->gfx.num_xcc_per_xcp;
+ if (num_vcn && num_xcp > num_vcn)
+ num_shared_vcn = num_xcp / num_vcn;
switch (ip_id) {
case AMDGPU_XCP_GFXHUB:
@@ -441,7 +434,8 @@ static int __aqua_vanjaram_get_xcp_ip_info(struct amdgpu_xcp_mgr *xcp_mgr, int x
ip->ip_funcs = &sdma_v4_4_2_xcp_funcs;
break;
case AMDGPU_XCP_VCN:
- ip->inst_mask = XCP_INST_MASK(num_vcn_xcp, xcp_id);
+ ip->inst_mask =
+ XCP_INST_MASK(num_vcn_xcp, xcp_id / num_shared_vcn);
/* TODO: Assign IP funcs */
break;
default:
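The rewritten switch derives the per-partition instance counts uniformly from the partition count instead of hard-coding one case per mode. A worked example with hypothetical instance counts (not taken from the patch):

/*
 * NUM_XCC = 8, num_xcc_per_xcp = 2  ->  num_xcp = 4
 * num_sdma = 8  ->  num_sdma_xcp = DIV_ROUND_UP(8, 4) = 2
 * num_vcn  = 4  ->  num_vcn_xcp  = DIV_ROUND_UP(4, 4) = 1
 *
 * In CPX mode (num_xcc_per_xcp = 1 -> num_xcp = 8) the VCNs are
 * shared: num_shared_vcn = 8 / 4 = 2 and the VCN inst_mask below is
 * indexed with xcp_id / 2, so two partitions map to one instance.
 */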
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index 742adbc460c9..70c1399f738d 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -1881,6 +1881,7 @@ static int dce_v10_0_crtc_do_set_base(struct drm_crtc *crtc,
return r;
if (!atomic) {
+ abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
if (unlikely(r != 0)) {
amdgpu_bo_unreserve(abo);
@@ -2401,6 +2402,7 @@ static int dce_v10_0_crtc_cursor_set2(struct drm_crtc *crtc,
return ret;
}
+ aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
amdgpu_bo_unreserve(aobj);
if (ret) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index 8d46ebadfa46..f154c24499c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -1931,6 +1931,7 @@ static int dce_v11_0_crtc_do_set_base(struct drm_crtc *crtc,
return r;
if (!atomic) {
+ abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
if (unlikely(r != 0)) {
amdgpu_bo_unreserve(abo);
@@ -2485,6 +2486,7 @@ static int dce_v11_0_crtc_cursor_set2(struct drm_crtc *crtc,
return ret;
}
+ aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
amdgpu_bo_unreserve(aobj);
if (ret) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index f08dc6a3886f..a7fcb135827f 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -1861,6 +1861,7 @@ static int dce_v6_0_crtc_do_set_base(struct drm_crtc *crtc,
return r;
if (!atomic) {
+ abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
if (unlikely(r != 0)) {
amdgpu_bo_unreserve(abo);
@@ -2321,6 +2322,7 @@ static int dce_v6_0_crtc_cursor_set2(struct drm_crtc *crtc,
return ret;
}
+ aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
amdgpu_bo_unreserve(aobj);
if (ret) {
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
index a6a3adf2ae13..77ac3f114d24 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c
@@ -1828,6 +1828,7 @@ static int dce_v8_0_crtc_do_set_base(struct drm_crtc *crtc,
return r;
if (!atomic) {
+ abo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(abo, AMDGPU_GEM_DOMAIN_VRAM);
if (unlikely(r != 0)) {
amdgpu_bo_unreserve(abo);
@@ -2320,6 +2321,7 @@ static int dce_v8_0_crtc_cursor_set2(struct drm_crtc *crtc,
return ret;
}
+ aobj->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
ret = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
amdgpu_bo_unreserve(aobj);
if (ret) {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index d1357c01eb39..47b47d21f464 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -202,12 +202,16 @@ static const struct amdgpu_hwip_reg_entry gc_gfx_queue_reg_list_12[] = {
SOC15_REG_ENTRY_STR(GC, 0, regCP_IB1_BUFSZ)
};
-static const struct soc15_reg_golden golden_settings_gc_12_0[] = {
+static const struct soc15_reg_golden golden_settings_gc_12_0_rev0[] = {
SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_MEM_CONFIG, 0x0000000f, 0x0000000f),
SOC15_REG_GOLDEN_VALUE(GC, 0, regCB_HW_CONTROL_1, 0x03000000, 0x03000000),
SOC15_REG_GOLDEN_VALUE(GC, 0, regGL2C_CTRL5, 0x00000070, 0x00000020)
};
+static const struct soc15_reg_golden golden_settings_gc_12_0[] = {
+ SOC15_REG_GOLDEN_VALUE(GC, 0, regDB_MEM_CONFIG, 0x00008000, 0x00008000),
+};
+
#define DEFAULT_SH_MEM_CONFIG \
((SH_MEM_ADDRESS_MODE_64 << SH_MEM_CONFIG__ADDRESS_MODE__SHIFT) | \
(SH_MEM_ALIGNMENT_MODE_UNALIGNED << SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) | \
@@ -3495,10 +3499,14 @@ static void gfx_v12_0_init_golden_registers(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
case IP_VERSION(12, 0, 0):
case IP_VERSION(12, 0, 1):
+ soc15_program_register_sequence(adev,
+ golden_settings_gc_12_0,
+ (const u32)ARRAY_SIZE(golden_settings_gc_12_0));
+
if (adev->rev_id == 0)
soc15_program_register_sequence(adev,
- golden_settings_gc_12_0,
- (const u32)ARRAY_SIZE(golden_settings_gc_12_0));
+ golden_settings_gc_12_0_rev0,
+ (const u32)ARRAY_SIZE(golden_settings_gc_12_0_rev0));
break;
default:
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 408e5600bb61..c100845409f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -1701,7 +1701,15 @@ static void gfx_v9_4_3_xcc_cp_compute_enable(struct amdgpu_device *adev,
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL, 0);
} else {
WREG32_SOC15_RLC(GC, GET_INST(GC, xcc_id), regCP_MEC_CNTL,
- (CP_MEC_CNTL__MEC_ME1_HALT_MASK | CP_MEC_CNTL__MEC_ME2_HALT_MASK));
+ (CP_MEC_CNTL__MEC_INVALIDATE_ICACHE_MASK |
+ CP_MEC_CNTL__MEC_ME1_PIPE0_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME1_PIPE1_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME1_PIPE2_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME1_PIPE3_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME2_PIPE0_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME2_PIPE1_RESET_MASK |
+ CP_MEC_CNTL__MEC_ME1_HALT_MASK |
+ CP_MEC_CNTL__MEC_ME2_HALT_MASK));
adev->gfx.kiq[xcc_id].ring.sched.ready = false;
}
udelay(50);
@@ -2240,6 +2248,8 @@ static int gfx_v9_4_3_xcc_cp_resume(struct amdgpu_device *adev, int xcc_id)
r = gfx_v9_4_3_xcc_cp_compute_load_microcode(adev, xcc_id);
if (r)
return r;
+ } else {
+ gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id);
}
r = gfx_v9_4_3_xcc_kiq_resume(adev, xcc_id);
@@ -2299,12 +2309,6 @@ static int gfx_v9_4_3_cp_resume(struct amdgpu_device *adev)
return 0;
}
-static void gfx_v9_4_3_xcc_cp_enable(struct amdgpu_device *adev, bool enable,
- int xcc_id)
-{
- gfx_v9_4_3_xcc_cp_compute_enable(adev, enable, xcc_id);
-}
-
static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id)
{
if (amdgpu_gfx_disable_kcq(adev, xcc_id))
@@ -2336,7 +2340,7 @@ static void gfx_v9_4_3_xcc_fini(struct amdgpu_device *adev, int xcc_id)
}
gfx_v9_4_3_xcc_kcq_fini_register(adev, xcc_id);
- gfx_v9_4_3_xcc_cp_enable(adev, false, xcc_id);
+ gfx_v9_4_3_xcc_cp_compute_enable(adev, false, xcc_id);
}
static int gfx_v9_4_3_hw_init(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c
index 6c1891889c4d..d4f72e47ae9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c
@@ -153,7 +153,7 @@ static void imu_v11_0_setup(struct amdgpu_device *adev)
WREG32_SOC15(GC, 0, regGFX_IMU_C2PMSG_16, imu_reg_val);
}
- //disble imu Rtavfs, SmsRepair, DfllBTC, and ClkB
+ //disable imu Rtavfs, SmsRepair, DfllBTC, and ClkB
imu_reg_val = RREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_10);
imu_reg_val |= 0x10007;
WREG32_SOC15(GC, 0, regGFX_IMU_SCRATCH_10, imu_reg_val);
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index ee91ff9e52a2..231a3d490ea8 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -161,7 +161,7 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
int api_status_off)
{
union MESAPI__QUERY_MES_STATUS mes_status_pkt;
- signed long timeout = 3000000; /* 3000 ms */
+ signed long timeout = 2100000; /* 2100 ms */
struct amdgpu_device *adev = mes->adev;
struct amdgpu_ring *ring = &mes->ring[0];
struct MES_API_STATUS *api_status;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index e499b2857a01..8d27421689c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -146,7 +146,7 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
int api_status_off)
{
union MESAPI__QUERY_MES_STATUS mes_status_pkt;
- signed long timeout = 3000000; /* 3000 ms */
+ signed long timeout = 2100000; /* 2100 ms */
struct amdgpu_device *adev = mes->adev;
struct amdgpu_ring *ring = &mes->ring[pipe];
spinlock_t *ring_lock = &mes->ring_lock[pipe];
@@ -479,6 +479,11 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes,
union MESAPI__MISC misc_pkt;
int pipe;
+ if (mes->adev->enable_uni_mes)
+ pipe = AMDGPU_MES_KIQ_PIPE;
+ else
+ pipe = AMDGPU_MES_SCHED_PIPE;
+
memset(&misc_pkt, 0, sizeof(misc_pkt));
misc_pkt.header.type = MES_API_TYPE_SCHEDULER;
@@ -513,6 +518,7 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes,
misc_pkt.wait_reg_mem.reg_offset2 = input->wrm_reg.reg1;
break;
case MES_MISC_OP_SET_SHADER_DEBUGGER:
+ pipe = AMDGPU_MES_SCHED_PIPE;
misc_pkt.opcode = MESAPI_MISC__SET_SHADER_DEBUGGER;
misc_pkt.set_shader_debugger.process_context_addr =
input->set_shader_debugger.process_context_addr;
@@ -530,11 +536,6 @@ static int mes_v12_0_misc_op(struct amdgpu_mes *mes,
return -EINVAL;
}
- if (mes->adev->enable_uni_mes)
- pipe = AMDGPU_MES_KIQ_PIPE;
- else
- pipe = AMDGPU_MES_SCHED_PIPE;
-
return mes_v12_0_submit_pkt_and_poll_completion(mes, pipe,
&misc_pkt, sizeof(misc_pkt),
offsetof(union MESAPI__MISC, api_status));
@@ -608,6 +609,7 @@ static int mes_v12_0_set_hw_resources(struct amdgpu_mes *mes, int pipe)
mes_set_hw_res_pkt.disable_mes_log = 1;
mes_set_hw_res_pkt.use_different_vmid_compute = 1;
mes_set_hw_res_pkt.enable_reg_active_poll = 1;
+ mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;
/*
* Keep oversubscribe timer for sdma. When we have unmapped doorbell
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
index fa479dfa1ec1..739fce4fa8fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
@@ -365,7 +365,7 @@ static void nbio_v2_3_enable_aspm(struct amdgpu_device *adev,
data &= ~PCIE_LC_CNTL__LC_PMI_TO_L1_DIS_MASK;
} else {
- /* Disbale ASPM L1 */
+ /* Disable ASPM L1 */
data &= ~PCIE_LC_CNTL__LC_L1_INACTIVITY_MASK;
/* Disable ASPM TxL0s */
data &= ~PCIE_LC_CNTL__LC_L0S_INACTIVITY_MASK;
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 1251ee38a676..51e470e8d67d 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -81,6 +81,8 @@ MODULE_FIRMWARE("amdgpu/psp_14_0_4_ta.bin");
/* memory training timeout define */
#define MEM_TRAIN_SEND_MSG_TIMEOUT_US 3000000
+#define regMP1_PUB_SCRATCH0 0x3b10090
+
static int psp_v13_0_init_microcode(struct psp_context *psp)
{
struct amdgpu_device *adev = psp->adev;
@@ -807,6 +809,20 @@ static bool psp_v13_0_get_ras_capability(struct psp_context *psp)
}
}
+static bool psp_v13_0_is_aux_sos_load_required(struct psp_context *psp)
+{
+ struct amdgpu_device *adev = psp->adev;
+ u32 pmfw_ver;
+
+ if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(13, 0, 6))
+ return false;
+
+ /* load the 4e version of sos if the pmfw version is less than 85.115.0 */
+ pmfw_ver = RREG32(regMP1_PUB_SCRATCH0 / 4);
+
+ return (pmfw_ver < 0x557300);
+}
+
static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.wait_for_bootloader = psp_v13_0_wait_for_bootloader_steady_state,
@@ -830,6 +846,7 @@ static const struct psp_funcs psp_v13_0_funcs = {
.vbflash_stat = psp_v13_0_vbflash_status,
.fatal_error_recovery_quirk = psp_v13_0_fatal_error_recovery_quirk,
.get_ras_capability = psp_v13_0_get_ras_capability,
+ .is_aux_sos_load_required = psp_v13_0_is_aux_sos_load_required,
};
void psp_v13_0_set_psp_funcs(struct psp_context *psp)
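The 0x557300 literal matches the 85.115.0 cutoff in the comment if the scratch register packs major/minor/patch into its low three bytes (0x55 = 85, 0x73 = 115, 0x00 = 0). A decoding sketch under that packing assumption:

static void decode_pmfw_ver(u32 pmfw_ver)
{
	u32 major = (pmfw_ver >> 16) & 0xff;	/* 0x55 -> 85  */
	u32 minor = (pmfw_ver >> 8) & 0xff;	/* 0x73 -> 115 */
	u32 patch = pmfw_ver & 0xff;		/* 0x00 -> 0   */

	pr_debug("pmfw version %u.%u.%u\n", major, minor, patch);
}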
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index aa637541da58..e65194fe94af 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -710,7 +710,7 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev)
upper_32_bits(wptr_gpu_addr));
wptr_poll_cntl = RREG32(mmSDMA0_GFX_RB_WPTR_POLL_CNTL + sdma_offsets[i]);
if (ring->use_pollmem) {
- /*wptr polling is not enogh fast, directly clean the wptr register */
+ /* wptr polling is not fast enough, directly clean the wptr register */
WREG32(mmSDMA0_GFX_RB_WPTR + sdma_offsets[i], 0);
wptr_poll_cntl = REG_SET_FIELD(wptr_poll_cntl,
SDMA0_GFX_RB_WPTR_POLL_CNTL,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index cfd8e183ad50..a8763496aed3 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -1080,13 +1080,16 @@ static void sdma_v7_0_vm_copy_pte(struct amdgpu_ib *ib,
unsigned bytes = count * 8;
ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |
- SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR);
+ SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) |
+ SDMA_PKT_COPY_LINEAR_HEADER_CPV(1);
+
ib->ptr[ib->length_dw++] = bytes - 1;
ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */
ib->ptr[ib->length_dw++] = lower_32_bits(src);
ib->ptr[ib->length_dw++] = upper_32_bits(src);
ib->ptr[ib->length_dw++] = lower_32_bits(pe);
ib->ptr[ib->length_dw++] = upper_32_bits(pe);
+ ib->ptr[ib->length_dw++] = 0;
}
@@ -1744,7 +1747,7 @@ static void sdma_v7_0_set_buffer_funcs(struct amdgpu_device *adev)
}
static const struct amdgpu_vm_pte_funcs sdma_v7_0_vm_pte_funcs = {
- .copy_pte_num_dw = 7,
+ .copy_pte_num_dw = 8,
.copy_pte = sdma_v7_0_vm_copy_pte,
.write_pte = sdma_v7_0_vm_write_pte,
.set_pte_pde = sdma_v7_0_vm_set_pte_pde,
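Setting the CPV bit in the COPY_LINEAR header adds one dword to the packet, which is why the emitter above writes a trailing zero and .copy_pte_num_dw grows from 7 to 8; the ndw budgeting in amdgpu_vm_sdma_update() (see the amdgpu_vm_sdma.c hunks earlier) depends on this count. The layout, read off the emitter:

/*
 * dw0: header  (OP_COPY | SUBOP_COPY_LINEAR | CPV(1))
 * dw1: byte count - 1
 * dw2: src/dst endian swap
 * dw3: src address, low 32 bits
 * dw4: src address, high 32 bits
 * dw5: pe (dst) address, low 32 bits
 * dw6: pe (dst) address, high 32 bits
 * dw7: trailing CPV dword, written as 0 here
 */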
diff --git a/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c b/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c
index e4e30b9d481b..c04fdd2d5b38 100644
--- a/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/smuio_v9_0.c
@@ -60,7 +60,7 @@ static void smuio_v9_0_get_clock_gating_state(struct amdgpu_device *adev, u64 *f
{
u32 data;
- /* CGTT_ROM_CLK_CTRL0 is not availabe for APUs */
+ /* CGTT_ROM_CLK_CTRL0 is not available for APUs */
if (adev->flags & AMD_IS_APU)
return;
diff --git a/drivers/gpu/drm/amd/amdgpu/soc24.c b/drivers/gpu/drm/amd/amdgpu/soc24.c
index b0c3678cfb31..fd4c3d4f8387 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc24.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc24.c
@@ -250,13 +250,6 @@ static void soc24_program_aspm(struct amdgpu_device *adev)
adev->nbio.funcs->program_aspm(adev);
}
-static void soc24_enable_doorbell_aperture(struct amdgpu_device *adev,
- bool enable)
-{
- adev->nbio.funcs->enable_doorbell_aperture(adev, enable);
- adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, enable);
-}
-
const struct amdgpu_ip_block_version soc24_common_ip_block = {
.type = AMD_IP_BLOCK_TYPE_COMMON,
.major = 1,
@@ -454,6 +447,11 @@ static int soc24_common_late_init(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_nv_mailbox_get_irq(adev);
+ /* Enable the selfring doorbell aperture late because the doorbell BAR
+ * aperture will change if BAR resizing succeeds in gmc sw_init.
+ */
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, true);
+
return 0;
}
@@ -491,7 +489,7 @@ static int soc24_common_hw_init(void *handle)
adev->df.funcs->hw_init(adev);
/* enable the doorbell aperture */
- soc24_enable_doorbell_aperture(adev, true);
+ adev->nbio.funcs->enable_doorbell_aperture(adev, true);
return 0;
}
@@ -500,8 +498,13 @@ static int soc24_common_hw_fini(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
- /* disable the doorbell aperture */
- soc24_enable_doorbell_aperture(adev, false);
+ /* Disable the doorbell aperture and selfring doorbell aperture
+ * separately in hw_fini because soc24_enable_doorbell_aperture()
+ * has been removed and there is no need to delay disabling the
+ * selfring doorbell.
+ */
+ adev->nbio.funcs->enable_doorbell_aperture(adev, false);
+ adev->nbio.funcs->enable_doorbell_selfring_aperture(adev, false);
if (amdgpu_sriov_vf(adev))
xgpu_nv_mailbox_put_irq(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
index b1fd226b7efb..9d4f5352a62c 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c
@@ -1395,170 +1395,6 @@ static void vcn_v4_0_5_unified_ring_set_wptr(struct amdgpu_ring *ring)
}
}
-static int vcn_v4_0_5_limit_sched(struct amdgpu_cs_parser *p,
- struct amdgpu_job *job)
-{
- struct drm_gpu_scheduler **scheds;
-
- /* The create msg must be in the first IB submitted */
- if (atomic_read(&job->base.entity->fence_seq))
- return -EINVAL;
-
- /* if VCN0 is harvested, we can't support AV1 */
- if (p->adev->vcn.harvest_config & AMDGPU_VCN_HARVEST_VCN0)
- return -EINVAL;
-
- scheds = p->adev->gpu_sched[AMDGPU_HW_IP_VCN_ENC]
- [AMDGPU_RING_PRIO_0].sched;
- drm_sched_entity_modify_sched(job->base.entity, scheds, 1);
- return 0;
-}
-
-static int vcn_v4_0_5_dec_msg(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
- uint64_t addr)
-{
- struct ttm_operation_ctx ctx = { false, false };
- struct amdgpu_bo_va_mapping *map;
- uint32_t *msg, num_buffers;
- struct amdgpu_bo *bo;
- uint64_t start, end;
- unsigned int i;
- void *ptr;
- int r;
-
- addr &= AMDGPU_GMC_HOLE_MASK;
- r = amdgpu_cs_find_mapping(p, addr, &bo, &map);
- if (r) {
- DRM_ERROR("Can't find BO for addr 0x%08llx\n", addr);
- return r;
- }
-
- start = map->start * AMDGPU_GPU_PAGE_SIZE;
- end = (map->last + 1) * AMDGPU_GPU_PAGE_SIZE;
- if (addr & 0x7) {
- DRM_ERROR("VCN messages must be 8 byte aligned!\n");
- return -EINVAL;
- }
-
- bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
- amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
- r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
- if (r) {
- DRM_ERROR("Failed validating the VCN message BO (%d)!\n", r);
- return r;
- }
-
- r = amdgpu_bo_kmap(bo, &ptr);
- if (r) {
- DRM_ERROR("Failed mapping the VCN message (%d)!\n", r);
- return r;
- }
-
- msg = ptr + addr - start;
-
- /* Check length */
- if (msg[1] > end - addr) {
- r = -EINVAL;
- goto out;
- }
-
- if (msg[3] != RDECODE_MSG_CREATE)
- goto out;
-
- num_buffers = msg[2];
- for (i = 0, msg = &msg[6]; i < num_buffers; ++i, msg += 4) {
- uint32_t offset, size, *create;
-
- if (msg[0] != RDECODE_MESSAGE_CREATE)
- continue;
-
- offset = msg[1];
- size = msg[2];
-
- if (offset + size > end) {
- r = -EINVAL;
- goto out;
- }
-
- create = ptr + addr + offset - start;
-
- /* H264, HEVC and VP9 can run on any instance */
- if (create[0] == 0x7 || create[0] == 0x10 || create[0] == 0x11)
- continue;
-
- r = vcn_v4_0_5_limit_sched(p, job);
- if (r)
- goto out;
- }
-
-out:
- amdgpu_bo_kunmap(bo);
- return r;
-}
-
-#define RADEON_VCN_ENGINE_TYPE_ENCODE (0x00000002)
-#define RADEON_VCN_ENGINE_TYPE_DECODE (0x00000003)
-
-#define RADEON_VCN_ENGINE_INFO (0x30000001)
-#define RADEON_VCN_ENGINE_INFO_MAX_OFFSET 16
-
-#define RENCODE_ENCODE_STANDARD_AV1 2
-#define RENCODE_IB_PARAM_SESSION_INIT 0x00000003
-#define RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET 64
-
-/* return the offset in ib if id is found, -1 otherwise
- * to speed up the searching we only search upto max_offset
- */
-static int vcn_v4_0_5_enc_find_ib_param(struct amdgpu_ib *ib, uint32_t id, int max_offset)
-{
- int i;
-
- for (i = 0; i < ib->length_dw && i < max_offset && ib->ptr[i] >= 8; i += ib->ptr[i]/4) {
- if (ib->ptr[i + 1] == id)
- return i;
- }
- return -1;
-}
-
-static int vcn_v4_0_5_ring_patch_cs_in_place(struct amdgpu_cs_parser *p,
- struct amdgpu_job *job,
- struct amdgpu_ib *ib)
-{
- struct amdgpu_ring *ring = amdgpu_job_ring(job);
- struct amdgpu_vcn_decode_buffer *decode_buffer;
- uint64_t addr;
- uint32_t val;
- int idx;
-
- /* The first instance can decode anything */
- if (!ring->me)
- return 0;
-
- /* RADEON_VCN_ENGINE_INFO is at the top of ib block */
- idx = vcn_v4_0_5_enc_find_ib_param(ib, RADEON_VCN_ENGINE_INFO,
- RADEON_VCN_ENGINE_INFO_MAX_OFFSET);
- if (idx < 0) /* engine info is missing */
- return 0;
-
- val = amdgpu_ib_get_value(ib, idx + 2); /* RADEON_VCN_ENGINE_TYPE */
- if (val == RADEON_VCN_ENGINE_TYPE_DECODE) {
- decode_buffer = (struct amdgpu_vcn_decode_buffer *)&ib->ptr[idx + 6];
-
- if (!(decode_buffer->valid_buf_flag & 0x1))
- return 0;
-
- addr = ((u64)decode_buffer->msg_buffer_address_hi) << 32 |
- decode_buffer->msg_buffer_address_lo;
- return vcn_v4_0_5_dec_msg(p, job, addr);
- } else if (val == RADEON_VCN_ENGINE_TYPE_ENCODE) {
- idx = vcn_v4_0_5_enc_find_ib_param(ib, RENCODE_IB_PARAM_SESSION_INIT,
- RENCODE_IB_PARAM_SESSION_INIT_MAX_OFFSET);
- if (idx >= 0 && ib->ptr[idx + 2] == RENCODE_ENCODE_STANDARD_AV1)
- return vcn_v4_0_5_limit_sched(p, job);
- }
- return 0;
-}
-
static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = {
.type = AMDGPU_RING_TYPE_VCN_ENC,
.align_mask = 0x3f,
@@ -1566,7 +1402,6 @@ static const struct amdgpu_ring_funcs vcn_v4_0_5_unified_ring_vm_funcs = {
.get_rptr = vcn_v4_0_5_unified_ring_get_rptr,
.get_wptr = vcn_v4_0_5_unified_ring_get_wptr,
.set_wptr = vcn_v4_0_5_unified_ring_set_wptr,
- .patch_cs_in_place = vcn_v4_0_5_ring_patch_cs_in_place,
.emit_frame_size =
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 4 +
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 71b465f8d83e..648f40091aa3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3540,6 +3540,30 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
return debug_map_and_unlock(dqm);
}
+bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ int doorbell_off, u32 *queue_format)
+{
+ struct queue *q;
+ bool r = false;
+
+ if (!queue_format)
+ return r;
+
+ dqm_lock(dqm);
+
+ list_for_each_entry(q, &qpd->queues_list, list) {
+ if (q->properties.doorbell_off == doorbell_off) {
+ *queue_format = q->properties.format;
+ r = true;
+ goto out;
+ }
+ }
+
+out:
+ dqm_unlock(dqm);
+ return r;
+}
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..09ab36f8e8c6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
int debug_lock_and_unmap(struct device_queue_manager *dqm);
int debug_map_and_unlock(struct device_queue_manager *dqm);
int debug_refresh_runlist(struct device_queue_manager *dqm);
+bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
+ struct qcm_process_device *qpd,
+ int doorbell_off, u32 *queue_format);
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
{
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
index bb8cbfc39b90..37b69fe0ede3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v10.c
@@ -306,23 +306,8 @@ static void event_interrupt_wq_v10(struct kfd_node *dev,
client_id == SOC15_IH_CLIENTID_UTCL2) {
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
- uint32_t node_id = SOC15_NODEID_FROM_IH_ENTRY(ih_ring_entry);
- uint32_t vmid_type = SOC15_VMID_TYPE_FROM_IH_ENTRY(ih_ring_entry);
- int hub_inst = 0;
struct kfd_hsa_memory_exception_data exception_data;
- /* gfxhub */
- if (!vmid_type && dev->adev->gfx.funcs->ih_node_to_logical_xcc) {
- hub_inst = dev->adev->gfx.funcs->ih_node_to_logical_xcc(dev->adev,
- node_id);
- if (hub_inst < 0)
- hub_inst = 0;
- }
-
- /* mmhub */
- if (vmid_type && client_id == SOC15_IH_CLIENTID_VMC)
- hub_inst = node_id / 4;
-
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
index d163d92a692f..2b72d5b4949b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v12.c
@@ -341,6 +341,10 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
m->sdmax_rlcx_doorbell_offset =
q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
+ m->sdmax_rlcx_sched_cntl = (amdgpu_sdma_phase_quantum
+ << SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM__SHIFT)
+ & SDMA0_QUEUE0_SCHEDULE_CNTL__CONTEXT_QUANTUM_MASK;
+
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..d07acf1b2f93 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -270,6 +270,11 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
struct kfd_node *dev = NULL;
struct kfd_process *proc = NULL;
struct kfd_process_device *pdd = NULL;
+ int i;
+ struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
+ u32 queue_format;
+
+ memset(cu_occupancy, 0x0, sizeof(cu_occupancy));
pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
dev = pdd->dev;
@@ -287,8 +292,29 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
/* Collect wave count from the device if it supports it */
wave_cnt = 0;
max_waves_per_cu = 0;
- dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
- &max_waves_per_cu, 0);
+
+ /*
+ * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition.
+ * For AQL queues, because of cooperative dispatch we multiply the wave count
+ * by the number of XCCs in the partition to get the total wave count across
+ * all XCCs in the partition.
+ * For PM4 queues, there is no cooperative dispatch, so wave_cnt stays as it is.
+ */
+ dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
+ &max_waves_per_cu, ffs(dev->xcc_mask) - 1);
+
+ for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+ if (cu_occupancy[i].wave_cnt != 0 &&
+ kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
+ cu_occupancy[i].doorbell_off,
+ &queue_format)) {
+ if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4))
+ wave_cnt += cu_occupancy[i].wave_cnt;
+ else
+ wave_cnt += (NUM_XCC(dev->xcc_mask) *
+ cu_occupancy[i].wave_cnt);
+ }
+ }
/* Translate wave count to number of compute units */
cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
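A worked example of the new accounting with hypothetical numbers: in a partition with NUM_XCC = 4, one AQL queue reporting 32 waves on the first XCC and one PM4 queue reporting 8 waves give:

/*
 * wave_cnt = 32 * 4 + 8 = 136   (AQL scaled by the XCC count, PM4 as-is)
 *
 * with max_waves_per_cu = 32 the round-up division above yields
 * cu_cnt = (136 + 31) / 32 = 5 compute units occupied
 */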
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index b439d4d0bd84..01b960b15274 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -517,7 +517,6 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
if (retval)
goto err_destroy_queue;
- kfd_procfs_del_queue(pqn->q);
dqm = pqn->q->device->dqm;
retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
if (retval) {
@@ -527,6 +526,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
if (retval != -ETIME)
goto err_destroy_queue;
}
+ kfd_procfs_del_queue(pqn->q);
kfd_queue_release_buffers(pdd, &pqn->q->properties);
pqm_clean_queue_resource(pqm, pqn);
uninit_queue(pqn->q);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 0cff66735cfe..6e79028c5d78 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -808,6 +808,20 @@ static void dmub_hpd_callback(struct amdgpu_device *adev,
}
/**
+ * dmub_hpd_sense_callback - DMUB HPD sense processing callback.
+ * @adev: amdgpu_device pointer
+ * @notify: dmub notification structure
+ *
+ * HPD sense changes can occur during low power states and need to be
+ * propagated from firmware to the driver.
+ */
+static void dmub_hpd_sense_callback(struct amdgpu_device *adev,
+ struct dmub_notification *notify)
+{
+ DRM_DEBUG_DRIVER("DMUB HPD SENSE callback.\n");
+}
+
+/**
* register_dmub_notify_callback - Sets callback for DMUB notify
* @adev: amdgpu_device pointer
* @type: Type of dmub notification
@@ -1757,25 +1771,41 @@ static struct dml2_soc_bb *dm_dmub_get_vbios_bounding_box(struct amdgpu_device *
static enum dmub_ips_disable_type dm_get_default_ips_mode(
struct amdgpu_device *adev)
{
- /*
- * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to
- * cause a hard hang. A fix exists for newer PMFW.
- *
- * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest
- * IPS state in all cases, except for s0ix and all displays off (DPMS),
- * where IPS2 is allowed.
- *
- * When checking pmfw version, use the major and minor only.
- */
- if (amdgpu_ip_version(adev, DCE_HWIP, 0) == IP_VERSION(3, 5, 0) &&
- (adev->pm.fw_version & 0x00FFFF00) < 0x005D6300)
- return DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+ enum dmub_ips_disable_type ret = DMUB_IPS_ENABLE;
- if (amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 5, 0))
- return DMUB_IPS_ENABLE;
+ switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) {
+ case IP_VERSION(3, 5, 0):
+ /*
+ * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to
+ * cause a hard hang. A fix exists for newer PMFW.
+ *
+ * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest
+ * IPS state in all cases, except for s0ix and all displays off (DPMS),
+ * where IPS2 is allowed.
+ *
+ * When checking pmfw version, use the major and minor only.
+ */
+ if ((adev->pm.fw_version & 0x00FFFF00) < 0x005D6300)
+ ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+ else if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(11, 5, 0))
+ /*
+ * Other ASICs with DCN35 have residency issues with
+ * IPS2 in idle, so restrict them to using IPS2 only in
+ * display-off cases.
+ */
+ ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+ break;
+ case IP_VERSION(3, 5, 1):
+ ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
+ break;
+ default:
+ /* ASICs older than DCN35 do not have IPSs */
+ if (amdgpu_ip_version(adev, DCE_HWIP, 0) < IP_VERSION(3, 5, 0))
+ ret = DMUB_IPS_DISABLE_ALL;
+ break;
+ }
- /* ASICs older than DCN35 do not have IPSs */
- return DMUB_IPS_DISABLE_ALL;
+ return ret;
}
static int amdgpu_dm_init(struct amdgpu_device *adev)
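Both version checks mask fw_version with 0x00FFFF00, comparing only the major and minor bytes: 0x005D6300 is the 93.99 cutoff (0x5D = 93, 0x63 = 99), and the dcn35_clk_mgr hunk further down uses the same packing for its 93.12 (0x005d0c00) gate. A comparison sketch under that assumed packing; the helper name is illustrative:

static bool pmfw_older_than(u32 fw_version, u32 major, u32 minor)
{
	/* compare major.minor only, ignoring the patch byte */
	return (fw_version & 0x00FFFF00) < (major << 16 | minor << 8);
}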
@@ -3808,6 +3838,12 @@ static int register_hpd_handlers(struct amdgpu_device *adev)
DRM_ERROR("amdgpu: fail to register dmub hpd callback");
return -EINVAL;
}
+
+ if (!register_dmub_notify_callback(adev, DMUB_NOTIFICATION_HPD_SENSE_NOTIFY,
+ dmub_hpd_sense_callback, true)) {
+ DRM_ERROR("amdgpu: fail to register dmub hpd sense callback");
+ return -EINVAL;
+ }
}
list_for_each_entry(connector,
@@ -4449,6 +4485,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev)
#define AMDGPU_DM_DEFAULT_MIN_BACKLIGHT 12
#define AMDGPU_DM_DEFAULT_MAX_BACKLIGHT 255
+#define AMDGPU_DM_MIN_SPREAD ((AMDGPU_DM_DEFAULT_MAX_BACKLIGHT - AMDGPU_DM_DEFAULT_MIN_BACKLIGHT) / 2)
#define AUX_BL_DEFAULT_TRANSITION_TIME_MS 50
static void amdgpu_dm_update_backlight_caps(struct amdgpu_display_manager *dm,
@@ -4463,6 +4500,21 @@ static void amdgpu_dm_update_backlight_caps(struct amdgpu_display_manager *dm,
return;
amdgpu_acpi_get_backlight_caps(&caps);
+
+ /* validate the firmware value is sane */
+ if (caps.caps_valid) {
+ int spread = caps.max_input_signal - caps.min_input_signal;
+
+ if (caps.max_input_signal > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT ||
+ caps.min_input_signal < 0 ||
+ spread > AMDGPU_DM_DEFAULT_MAX_BACKLIGHT ||
+ spread < AMDGPU_DM_MIN_SPREAD) {
+ DRM_DEBUG_KMS("DM: Invalid backlight caps: min=%d, max=%d\n",
+ caps.min_input_signal, caps.max_input_signal);
+ caps.caps_valid = false;
+ }
+ }
+
if (caps.caps_valid) {
dm->backlight_caps[bl_idx].caps_valid = true;
if (caps.aux_support)
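With the defaults above, AMDGPU_DM_MIN_SPREAD works out to (255 - 12) / 2 = 121, so the sanity check rejects firmware tables whose usable range is less than half the default one. A worked example with made-up firmware values:

/*
 * caps.min_input_signal = 10, caps.max_input_signal = 50
 * spread = 50 - 10 = 40 < 121  ->  caps_valid is cleared and the
 * driver falls back to the default 12..255 range
 */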
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 2d7755e2b6c3..15d4690c74d6 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -50,7 +50,7 @@
#define AMDGPU_DM_MAX_NUM_EDP 2
-#define AMDGPU_DMUB_NOTIFICATION_MAX 6
+#define AMDGPU_DMUB_NOTIFICATION_MAX 7
#define HDMI_AMD_VENDOR_SPECIFIC_DATA_BLOCK_IEEE_REGISTRATION_ID 0x00001A
#define AMD_VSDB_VERSION_3_FEATURECAP_REPLAYMODE 0x40
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index c0c61c03984c..83a31b97e96b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -1147,7 +1147,7 @@ static int compute_mst_dsc_configs_for_link(struct drm_atomic_state *state,
params[count].num_slices_v = aconnector->dsc_settings.dsc_num_slices_v;
params[count].bpp_overwrite = aconnector->dsc_settings.dsc_bits_per_pixel;
params[count].compression_possible = stream->sink->dsc_caps.dsc_dec_caps.is_dsc_supported;
- dc_dsc_get_policy_for_timing(params[count].timing, 0, &dsc_policy);
+ dc_dsc_get_policy_for_timing(params[count].timing, 0, &dsc_policy, dc_link_get_highest_encoding_format(stream->link));
if (!dc_dsc_compute_bandwidth_range(
stream->sink->ctx->dc->res_pool->dscs[0],
stream->sink->ctx->dc->debug.dsc_min_slice_height_override,
@@ -1681,7 +1681,7 @@ static bool is_dsc_common_config_possible(struct dc_stream_state *stream,
{
struct dc_dsc_policy dsc_policy = {0};
- dc_dsc_get_policy_for_timing(&stream->timing, 0, &dsc_policy);
+ dc_dsc_get_policy_for_timing(&stream->timing, 0, &dsc_policy, dc_link_get_highest_encoding_format(stream->link));
dc_dsc_compute_bandwidth_range(stream->sink->ctx->dc->res_pool->dscs[0],
stream->sink->ctx->dc->debug.dsc_min_slice_height_override,
dsc_policy.min_target_bpp * 16,
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index 25f63b2e7a8e..495e3cd70426 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -961,6 +961,7 @@ static int amdgpu_dm_plane_helper_prepare_fb(struct drm_plane *plane,
else
domain = AMDGPU_GEM_DOMAIN_VRAM;
+ rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(rbo, domain);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c
index 08c494a7a21b..0d5fefb0f591 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_wb.c
@@ -114,6 +114,7 @@ static int amdgpu_dm_wb_prepare_job(struct drm_writeback_connector *wb_connector
domain = amdgpu_display_supported_domains(adev, rbo->flags);
+ rbo->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
r = amdgpu_bo_pin(rbo, domain);
if (unlikely(r != 0)) {
if (r != -ERESTARTSYS)
diff --git a/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c b/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c
index e47e9db062f4..681799468487 100644
--- a/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c
+++ b/drivers/gpu/drm/amd/display/dc/basics/dce_calcs.c
@@ -569,7 +569,7 @@ static void calculate_bandwidth(
break;
}
data->lb_partitions[i] = bw_floor2(bw_div(data->lb_size_per_component[i], data->lb_line_pitch), bw_int_to_fixed(1));
- /*clamp the partitions to the maxium number supported by the lb*/
+ /* clamp the partitions to the maximum number supported by the lb */
if ((surface_type[i] != bw_def_graphics || dceip->graphics_lb_nodownscaling_multi_line_prefetching == 1)) {
data->lb_partitions_max[i] = bw_int_to_fixed(10);
}
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c
index f770828df149..0e243f4344d0 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c
@@ -59,6 +59,7 @@ int clk_mgr_helper_get_active_display_cnt(
display_count = 0;
for (i = 0; i < context->stream_count; i++) {
const struct dc_stream_state *stream = context->streams[i];
+ const struct dc_stream_status *stream_status = &context->stream_status[i];
/* Don't count SubVP phantom pipes as part of active
* display count
@@ -66,13 +67,7 @@ int clk_mgr_helper_get_active_display_cnt(
if (dc_state_get_stream_subvp_type(context, stream) == SUBVP_PHANTOM)
continue;
- /*
- * Only notify active stream or virtual stream.
- * Need to notify virtual stream to work around
- * headless case. HPD does not fire when system is in
- * S0i2.
- */
- if (!stream->dpms_off || stream->signal == SIGNAL_TYPE_VIRTUAL)
+ if (!stream->dpms_off || (stream_status && stream_status->plane_count))
display_count++;
}
diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
index 97164b5585a8..b46a3afe48ca 100644
--- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
+++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn35/dcn35_clk_mgr.c
@@ -1222,6 +1222,12 @@ void dcn35_clk_mgr_construct(
ctx->dc->debug.disable_dpp_power_gate = false;
ctx->dc->debug.disable_hubp_power_gate = false;
ctx->dc->debug.disable_dsc_power_gate = false;
+
+ /* Disable dynamic IPS2 in older PMFW (93.12) for Z8 interop. */
+ if (ctx->dc->config.disable_ips == DMUB_IPS_ENABLE &&
+ ctx->dce_version == DCN_VERSION_3_5 &&
+ ((clk_mgr->base.smu_ver & 0x00FFFFFF) <= 0x005d0c00))
+ ctx->dc->config.disable_ips = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF;
} else {
/*let's reset the config control flag*/
ctx->dc->config.disable_ips = DMUB_IPS_DISABLE_ALL; /*pmfw not support it, disable it all*/
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index ae788154896c..5c39390ecbd5 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -1767,7 +1767,7 @@ bool dc_validate_boot_timing(const struct dc *dc,
if (crtc_timing->pix_clk_100hz != pix_clk_100hz)
return false;
- if (!se->funcs->dp_get_pixel_format)
+ if (!se || !se->funcs->dp_get_pixel_format)
return false;
if (!se->funcs->dp_get_pixel_format(
@@ -2376,7 +2376,7 @@ static bool is_surface_in_context(
return false;
}
-static enum surface_update_type get_plane_info_update_type(const struct dc_surface_update *u)
+static enum surface_update_type get_plane_info_update_type(const struct dc *dc, const struct dc_surface_update *u)
{
union surface_update_flags *update_flags = &u->surface->update_flags;
enum surface_update_type update_type = UPDATE_TYPE_FAST;
@@ -2455,7 +2455,7 @@ static enum surface_update_type get_plane_info_update_type(const struct dc_surfa
/* todo: below are HW dependent, we should add a hook to
* DCE/N resource and validated there.
*/
- if (u->plane_info->tiling_info.gfx9.swizzle != DC_SW_LINEAR) {
+ if (!dc->debug.skip_full_updated_if_possible) {
/* swizzled mode requires RQ to be setup properly,
* thus need to run DML to calculate RQ settings
*/
@@ -2547,7 +2547,7 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
update_flags->raw = 0; // Reset all flags
- type = get_plane_info_update_type(u);
+ type = get_plane_info_update_type(dc, u);
elevate_update_type(&overall_type, type);
type = get_scaling_info_update_type(dc, u);
@@ -2596,6 +2596,12 @@ static enum surface_update_type det_surface_update(const struct dc *dc,
elevate_update_type(&overall_type, UPDATE_TYPE_MED);
}
+ if (u->sdr_white_level_nits)
+ if (u->sdr_white_level_nits != u->surface->sdr_white_level_nits) {
+ update_flags->bits.sdr_white_level_nits = 1;
+ elevate_update_type(&overall_type, UPDATE_TYPE_FULL);
+ }
+
if (u->cm2_params) {
if ((u->cm2_params->component_settings.shaper_3dlut_setting
!= u->surface->mcm_shaper_3dlut_setting)
@@ -2876,6 +2882,10 @@ static void copy_surface_update_to_plane(
surface->hdr_mult =
srf_update->hdr_mult;
+ if (srf_update->sdr_white_level_nits)
+ surface->sdr_white_level_nits =
+ srf_update->sdr_white_level_nits;
+
if (srf_update->blend_tf)
memcpy(&surface->blend_tf, srf_update->blend_tf,
sizeof(surface->blend_tf));
@@ -4679,6 +4689,8 @@ static bool full_update_required(struct dc *dc,
srf_updates[i].scaling_info ||
(srf_updates[i].hdr_mult.value &&
srf_updates[i].hdr_mult.value != srf_updates->surface->hdr_mult.value) ||
+ (srf_updates[i].sdr_white_level_nits &&
+ srf_updates[i].sdr_white_level_nits != srf_updates->surface->sdr_white_level_nits) ||
srf_updates[i].in_transfer_func ||
srf_updates[i].func_shaper ||
srf_updates[i].lut3d_func ||
@@ -5744,6 +5756,27 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc,
}
/**
+ * dc_process_dmub_dpia_set_tps_notification - Submits tps notification
+ *
+ * @dc: [in] dc structure
+ * @link_index: [in] link index
+ * @tps: [in] requested training pattern sequence (TPS)
+ *
+ * Submits set_tps_notification command to dmub via inbox message
+ */
+void dc_process_dmub_dpia_set_tps_notification(const struct dc *dc, uint32_t link_index, uint8_t tps)
+{
+ union dmub_rb_cmd cmd = {0};
+
+ cmd.set_tps_notification.header.type = DMUB_CMD__DPIA;
+ cmd.set_tps_notification.header.sub_type = DMUB_CMD__DPIA_SET_TPS_NOTIFICATION;
+ cmd.set_tps_notification.tps_notification.instance = dc->links[link_index]->ddc_hw_inst;
+ cmd.set_tps_notification.tps_notification.tps = tps;
+
+ dc_wake_and_execute_dmub_cmd(dc->ctx, &cmd, DM_DMUB_WAIT_TYPE_WAIT);
+}
+
+/**
* dc_process_dmub_dpia_hpd_int_enable - Submits DPIA DPD interruption
*
* @dc: [in] dc structure
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h
index 4c94dd38be4b..3992ad73165b 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -55,7 +55,7 @@ struct aux_payload;
struct set_config_cmd_payload;
struct dmub_notification;
-#define DC_VER "3.2.299"
+#define DC_VER "3.2.301"
#define MAX_SURFACES 3
#define MAX_PLANES 6
@@ -462,6 +462,7 @@ struct dc_config {
bool support_edp0_on_dp1;
unsigned int enable_fpo_flicker_detection;
bool disable_hbr_audio_dp2;
+ bool consolidated_dpia_dp_lt;
};
enum visual_confirm {
@@ -762,7 +763,8 @@ union dpia_debug_options {
uint32_t disable_mst_dsc_work_around:1; /* bit 3 */
uint32_t enable_force_tbt3_work_around:1; /* bit 4 */
uint32_t disable_usb4_pm_support:1; /* bit 5 */
- uint32_t reserved:26;
+ uint32_t enable_consolidated_dpia_dp_lt:1; /* bit 6 */
+ uint32_t reserved:25;
} bits;
uint32_t raw;
};
@@ -1056,6 +1058,9 @@ struct dc_debug_options {
unsigned int force_lls;
bool notify_dpia_hr_bw;
bool enable_ips_visual_confirm;
+ unsigned int sharpen_policy;
+ unsigned int scale_to_sharpness_policy;
+ bool skip_full_updated_if_possible;
};
@@ -1269,6 +1274,7 @@ union surface_update_flags {
uint32_t tmz_changed:1;
uint32_t mcm_transfer_function_enable_change:1; /* disable or enable MCM transfer func */
uint32_t full_update:1;
+ uint32_t sdr_white_level_nits:1;
} bits;
uint32_t raw;
@@ -1351,6 +1357,7 @@ struct dc_plane_state {
bool adaptive_sharpness_en;
int sharpness_level;
enum linear_light_scaling linear_light_scaling;
+ unsigned int sdr_white_level_nits;
};
struct dc_plane_info {
@@ -1508,6 +1515,7 @@ struct dc_surface_update {
*/
struct dc_cm2_parameters *cm2_params;
const struct dc_csc_transform *cursor_csc_color_matrix;
+ unsigned int sdr_white_level_nits;
};
/*
@@ -2520,6 +2528,8 @@ enum dc_status dc_process_dmub_set_mst_slots(const struct dc *dc,
uint8_t mst_alloc_slots,
uint8_t *mst_slots_in_use);
+void dc_process_dmub_dpia_set_tps_notification(const struct dc *dc, uint32_t link_index, uint8_t tps);
+
void dc_process_dmub_dpia_hpd_int_enable(const struct dc *dc,
uint32_t hpd_int_enable);
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h
index 519c3df78ee5..41bd95e9177a 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h
@@ -969,6 +969,14 @@ union dp_sink_video_fallback_formats {
uint8_t raw;
};
+union dpcd_max_uncompressed_pixel_rate_cap {
+ struct {
+ uint16_t max_uncompressed_pixel_rate_cap :15;
+ uint16_t valid :1;
+ } bits;
+ uint8_t raw[2];
+};
+
union dp_fec_capability1 {
struct {
uint8_t AGGREGATED_ERROR_COUNTERS_CAPABLE :1;
@@ -1170,6 +1178,7 @@ struct dpcd_caps {
struct dc_lttpr_caps lttpr_caps;
struct adaptive_sync_caps adaptive_sync_caps;
struct dpcd_usb4_dp_tunneling_info usb4_dp_tun_info;
+ union dpcd_max_uncompressed_pixel_rate_cap max_uncompressed_pixel_rate_cap;
union dp_128b_132b_supported_link_rates dp_128b_132b_supported_link_rates;
union dp_main_line_channel_coding_cap channel_coding_cap;
@@ -1340,6 +1349,9 @@ struct dp_trace {
#ifndef DP_CABLE_ATTRIBUTES_UPDATED_BY_DPTX
#define DP_CABLE_ATTRIBUTES_UPDATED_BY_DPTX 0x110
#endif
+#ifndef DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP
+#define DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP 0x221c
+#endif
#ifndef DP_REPEATER_CONFIGURATION_AND_STATUS_SIZE
#define DP_REPEATER_CONFIGURATION_AND_STATUS_SIZE 0x50
#endif
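The new union overlays a 15-bit rate cap plus a valid bit on the two bytes read from DPCD address 0x221c. A parsing sketch, assuming little-endian byte order as implied by the raw[2] overlay of the 16-bit bitfield:

static void parse_uncompressed_rate_cap(const u8 dpcd[2])
{
	union dpcd_max_uncompressed_pixel_rate_cap cap;

	cap.raw[0] = dpcd[0];	/* low byte of the 16-bit field        */
	cap.raw[1] = dpcd[1];	/* high byte; bit 15 is the valid flag */

	if (cap.bits.valid)
		pr_debug("max uncompressed pixel rate cap: %u\n",
			 cap.bits.max_uncompressed_pixel_rate_cap);
}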
diff --git a/drivers/gpu/drm/amd/display/dc/dc_dsc.h b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
index fe3078b8789e..9014c2409817 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dsc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
@@ -59,6 +59,7 @@ struct dc_dsc_config_options {
uint32_t max_target_bpp_limit_override_x16;
uint32_t slice_height_granularity;
uint32_t dsc_force_odm_hslice_override;
+ bool force_dsc_when_not_needed;
};
bool dc_dsc_parse_dsc_dpcd(const struct dc *dc,
@@ -100,7 +101,8 @@ uint32_t dc_dsc_stream_bandwidth_overhead_in_kbps(
*/
void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing,
uint32_t max_target_bpp_limit_override_x16,
- struct dc_dsc_policy *policy);
+ struct dc_dsc_policy *policy,
+ const enum dc_link_encoding_format link_encoding);
void dc_dsc_policy_set_max_target_bpp_limit(uint32_t limit);
diff --git a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c
index cd6de93eb91c..603552dbd771 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_spl_translate.c
@@ -186,19 +186,17 @@ void translate_SPL_in_params_from_pipe_ctx(struct pipe_ctx *pipe_ctx, struct spl
spl_in->h_active = pipe_ctx->plane_res.scl_data.h_active;
spl_in->v_active = pipe_ctx->plane_res.scl_data.v_active;
+
+ spl_in->debug.sharpen_policy = (enum sharpen_policy)pipe_ctx->stream->ctx->dc->debug.sharpen_policy;
+ spl_in->debug.scale_to_sharpness_policy =
+ (enum scale_to_sharpness_policy)pipe_ctx->stream->ctx->dc->debug.scale_to_sharpness_policy;
+
/* Check if the stream is in fullscreen and if it's HDR.
* Use this to determine sharpness levels
*/
spl_in->is_fullscreen = dm_helpers_is_fullscreen(pipe_ctx->stream->ctx, pipe_ctx->stream);
spl_in->is_hdr_on = dm_helpers_is_hdr_on(pipe_ctx->stream->ctx, pipe_ctx->stream);
- spl_in->hdr_multx100 = 0;
- if (spl_in->is_hdr_on) {
- spl_in->hdr_multx100 = (uint32_t)dc_fixpt_floor(dc_fixpt_mul(plane_state->hdr_mult,
- dc_fixpt_from_int(100)));
- /* Disable sharpness for HDR Mult > 6.0 */
- if (spl_in->hdr_multx100 > 600)
- spl_in->adaptive_sharpness.enable = false;
- }
+ spl_in->sdr_white_level_nits = plane_state->sdr_white_level_nits;
}
/// @brief Translate SPL output parameters to pipe context
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c
index e7019c95ba79..4fce64a030b6 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20.c
@@ -313,9 +313,6 @@ static void handle_det_buf_split(struct display_mode_lib *mode_lib,
if (swath_height_c > 0)
log2_swath_height_c = dml_log2(swath_height_c);
-
- if (req128_c && log2_swath_height_c > 0)
- log2_swath_height_c -= 1;
}
rq_param->dlg.rq_l.swath_height = 1 << log2_swath_height_l;
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c
index ae5251041728..3fa9a5da02f6 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_rq_dlg_calc_20v2.c
@@ -313,9 +313,6 @@ static void handle_det_buf_split(struct display_mode_lib *mode_lib,
if (swath_height_c > 0)
log2_swath_height_c = dml_log2(swath_height_c);
-
- if (req128_c && log2_swath_height_c > 0)
- log2_swath_height_c -= 1;
}
rq_param->dlg.rq_l.swath_height = 1 << log2_swath_height_l;
diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
index 0b132ce1d2cd..2b275e680379 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
+++ b/drivers/gpu/drm/amd/display/dc/dml/dcn31/display_mode_vba_31.c
@@ -1924,15 +1924,6 @@ static unsigned int CalculateVMAndRowBytes(
*PixelPTEReqWidth = 32768.0 / BytePerPixel;
*PTERequestSize = 64;
FractionOfPTEReturnDrop = 0;
- } else if (MacroTileSizeBytes == 4096) {
- PixelPTEReqHeightPTEs = 1;
- *PixelPTEReqHeight = MacroTileHeight;
- *PixelPTEReqWidth = 8 * *MacroTileWidth;
- *PTERequestSize = 64;
- if (ScanDirection != dm_vert)
- FractionOfPTEReturnDrop = 0;
- else
- FractionOfPTEReturnDrop = 7.0 / 8;
} else if (GPUVMMinPageSize == 4 && MacroTileSizeBytes > 4096) {
PixelPTEReqHeightPTEs = 16;
*PixelPTEReqHeight = 16 * BlockHeight256Bytes;
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c
index 547dfcc80fde..d851c081e376 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/display_mode_core.c
@@ -8926,7 +8926,7 @@ void dml_core_mode_programming(struct display_mode_lib_st *mode_lib, const struc
// The prefetch scheduling should only be calculated once as per AllowForPStateChangeOrStutterInVBlank requirement
	// If the AllowForPStateChangeOrStutterInVBlank requirement is not strict (i.e. only try those power saving features
-	// if possible, then will try to program for the best power saving features in order of diffculty (dram, fclk, stutter)
+	// if possible), then it will try to program the best power-saving features in order of difficulty (dram, fclk, stutter)
s->iteration = 0;
s->MaxTotalRDBandwidth = 0;
s->AllPrefetchModeTested = false;
@@ -9977,7 +9977,7 @@ void dml_core_get_row_heights(
dml_print("DML_DLG: %s: GPUVMMinPageSizeKBytes = %u\n", __func__, GPUVMMinPageSizeKBytes);
#endif
- // just suppluy with enough parameters to calculate meta and dte
+	// just supply enough parameters to calculate meta and dte
CalculateVMAndRowBytes(
0, // dml_bool_t ViewportStationary,
1, // dml_bool_t DCCEnable,
@@ -10110,7 +10110,7 @@ dml_bool_t dml_mode_support(
/// Note: In this function, it is assumed that DCFCLK, SOCCLK freq are the state values, and mode_program will just use the DML calculated DPPCLK and DISPCLK
/// @param mode_lib mode_lib data struct that house all the input/output/bbox and calculation values.
/// @param state_idx Power state idx chosen
-/// @param display_cfg Display Congiuration
+/// @param display_cfg Display Configuration
/// @param call_standalone Calling mode_programming without calling mode support. Some of the "support" struct member will be pre-calculated before doing mode programming
/// TODO: Add clk_cfg input, could be useful for standalone mode
dml_bool_t dml_mode_programming(
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
index b0d9aed0f265..8697eac1e1f7 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c
@@ -858,7 +858,9 @@ static void populate_dml21_plane_config_from_plane_state(struct dml2_context *dm
plane->immediate_flip = plane_state->flip_immediate;
- plane->composition.rect_out_height_spans_vactive = plane_state->dst_rect.height >= stream->timing.v_addressable;
+ plane->composition.rect_out_height_spans_vactive =
+ plane_state->dst_rect.height >= stream->timing.v_addressable &&
+ stream->dst.height >= stream->timing.v_addressable;
}
//TODO : Could be possibly moved to a common helper layer.
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
index d63558ee3135..1cf9015e854a 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_pmo/dml2_pmo_dcn4_fams2.c
@@ -940,9 +940,11 @@ static void build_synchronized_timing_groups(
/* find synchronizable timing groups */
for (j = i + 1; j < display_config->display_config.num_streams; j++) {
if (memcmp(master_timing,
- &display_config->display_config.stream_descriptors[j].timing,
- sizeof(struct dml2_timing_cfg)) == 0 &&
- display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder) {
+ &display_config->display_config.stream_descriptors[j].timing,
+ sizeof(struct dml2_timing_cfg)) == 0 &&
+ display_config->display_config.stream_descriptors[i].output.output_encoder == display_config->display_config.stream_descriptors[j].output.output_encoder &&
+ (display_config->display_config.stream_descriptors[i].output.output_encoder != dml2_hdmi || //hdmi requires formats match
+ display_config->display_config.stream_descriptors[i].output.output_format == display_config->display_config.stream_descriptors[j].output.output_format)) {
set_bit_in_bitfield(&pmo->scratch.pmo_dcn4.synchronized_timing_group_masks[timing_group_idx], j);
set_bit_in_bitfield(&stream_mapped_mask, j);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
index a1727e5bf024..ebd5df1a36e8 100644
--- a/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
+++ b/drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c
@@ -668,6 +668,7 @@ static bool decide_dsc_bandwidth_range(
*/
static bool decide_dsc_target_bpp_x16(
const struct dc_dsc_policy *policy,
+ const struct dc_dsc_config_options *options,
const struct dsc_enc_caps *dsc_common_caps,
const int target_bandwidth_kbps,
const struct dc_crtc_timing *timing,
@@ -682,7 +683,7 @@ static bool decide_dsc_target_bpp_x16(
if (decide_dsc_bandwidth_range(policy->min_target_bpp * 16, policy->max_target_bpp * 16,
num_slices_h, dsc_common_caps, timing, link_encoding, &range)) {
if (target_bandwidth_kbps >= range.stream_kbps) {
- if (policy->enable_dsc_when_not_needed)
+ if (policy->enable_dsc_when_not_needed || options->force_dsc_when_not_needed)
				/* enable max bpp even if DSC is not needed */
*target_bpp_x16 = range.max_target_bpp_x16;
} else if (target_bandwidth_kbps >= range.max_kbps) {
@@ -882,7 +883,7 @@ static bool setup_dsc_config(
memset(dsc_cfg, 0, sizeof(struct dc_dsc_config));
- dc_dsc_get_policy_for_timing(timing, options->max_target_bpp_limit_override_x16, &policy);
+ dc_dsc_get_policy_for_timing(timing, options->max_target_bpp_limit_override_x16, &policy, link_encoding);
pic_width = timing->h_addressable + timing->h_border_left + timing->h_border_right;
pic_height = timing->v_addressable + timing->v_border_top + timing->v_border_bottom;
@@ -1080,6 +1081,7 @@ static bool setup_dsc_config(
if (target_bandwidth_kbps > 0) {
is_dsc_possible = decide_dsc_target_bpp_x16(
&policy,
+ options,
&dsc_common_caps,
target_bandwidth_kbps,
timing,
@@ -1171,7 +1173,8 @@ uint32_t dc_dsc_stream_bandwidth_overhead_in_kbps(
void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing,
uint32_t max_target_bpp_limit_override_x16,
- struct dc_dsc_policy *policy)
+ struct dc_dsc_policy *policy,
+ const enum dc_link_encoding_format link_encoding)
{
uint32_t bpc = 0;
@@ -1235,10 +1238,7 @@ void dc_dsc_get_policy_for_timing(const struct dc_crtc_timing *timing,
policy->max_target_bpp = max_target_bpp_limit_override_x16 / 16;
/* enable DSC when not needed, default false */
- if (dsc_policy_enable_dsc_when_not_needed)
- policy->enable_dsc_when_not_needed = dsc_policy_enable_dsc_when_not_needed;
- else
- policy->enable_dsc_when_not_needed = false;
+ policy->enable_dsc_when_not_needed = dsc_policy_enable_dsc_when_not_needed;
}
void dc_dsc_policy_set_max_target_bpp_limit(uint32_t limit)
@@ -1267,4 +1267,5 @@ void dc_dsc_get_default_config_option(const struct dc *dc, struct dc_dsc_config_
options->dsc_force_odm_hslice_override = dc->debug.force_odm_combine;
options->max_target_bpp_limit_override_x16 = 0;
options->slice_height_granularity = 1;
+ options->force_dsc_when_not_needed = false;
}
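With the new option plumbed through, a caller can force DSC on even when the stream fits the link uncompressed, without touching the global dsc_policy_enable_dsc_when_not_needed knob. A hedged usage sketch (the surrounding setup is assumed to exist):

	struct dc_dsc_config_options options;

	dc_dsc_get_default_config_option(dc, &options);
	options.force_dsc_when_not_needed = true;	/* new field, defaults to false */
	/* options then flows into setup_dsc_config(), where
	 * decide_dsc_target_bpp_x16() picks range.max_target_bpp_x16
	 * even though target_bandwidth_kbps >= range.stream_kbps.
	 */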
diff --git a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c
index 6293173ba2b9..5eb3da8d5206 100644
--- a/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c
+++ b/drivers/gpu/drm/amd/display/dc/hubbub/dcn35/dcn35_hubbub.c
@@ -545,6 +545,7 @@ static void hubbub35_init(struct hubbub *hubbub)
DCHUBBUB_ARB_MAX_REQ_OUTSTAND, 256,
DCHUBBUB_ARB_MIN_REQ_OUTSTAND, 256);
+ memset(&hubbub2->watermarks.a.cstate_pstate, 0, sizeof(hubbub2->watermarks.a.cstate_pstate));
}
/*static void hubbub35_set_request_limit(struct hubbub *hubbub,
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
index d52ce58c6a98..4fbed0298adf 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c
@@ -57,6 +57,7 @@
#include "panel_cntl.h"
#include "dc_state_priv.h"
#include "dpcd_defs.h"
+#include "dsc.h"
/* include DCE11 register header files */
#include "dce/dce_11_0_d.h"
#include "dce/dce_11_0_sh_mask.h"
@@ -1823,6 +1824,48 @@ static void get_edp_links_with_sink(
}
}
+static void clean_up_dsc_blocks(struct dc *dc)
+{
+ struct display_stream_compressor *dsc = NULL;
+ struct timing_generator *tg = NULL;
+ struct stream_encoder *se = NULL;
+ struct dccg *dccg = dc->res_pool->dccg;
+ struct pg_cntl *pg_cntl = dc->res_pool->pg_cntl;
+ int i;
+
+ if (dc->ctx->dce_version != DCN_VERSION_3_5 &&
+ dc->ctx->dce_version != DCN_VERSION_3_51)
+ return;
+
+ for (i = 0; i < dc->res_pool->res_cap->num_dsc; i++) {
+ struct dcn_dsc_state s = {0};
+
+ dsc = dc->res_pool->dscs[i];
+ dsc->funcs->dsc_read_state(dsc, &s);
+ if (s.dsc_fw_en) {
+ /* disable DSC in OPTC */
+ if (i < dc->res_pool->timing_generator_count) {
+ tg = dc->res_pool->timing_generators[i];
+ tg->funcs->set_dsc_config(tg, OPTC_DSC_DISABLED, 0, 0);
+ }
+ /* disable DSC in stream encoder */
+ if (i < dc->res_pool->stream_enc_count) {
+ se = dc->res_pool->stream_enc[i];
+ se->funcs->dp_set_dsc_config(se, OPTC_DSC_DISABLED, 0, 0);
+ se->funcs->dp_set_dsc_pps_info_packet(se, false, NULL, true);
+ }
+ /* disable DSC block */
+ if (dccg->funcs->set_ref_dscclk)
+ dccg->funcs->set_ref_dscclk(dccg, dsc->inst);
+ dsc->funcs->dsc_disable(dsc);
+
+ /* power down DSC */
+ if (pg_cntl != NULL)
+ pg_cntl->funcs->dsc_pg_control(pg_cntl, dsc->inst, false);
+ }
+ }
+}
+
/*
* When ASIC goes from VBIOS/VGA mode to driver/accelerated mode we need:
* 1. Power down all DC HW blocks
@@ -1927,6 +1970,13 @@ void dce110_enable_accelerated_mode(struct dc *dc, struct dc_state *context)
clk_mgr_exit_optimized_pwr_state(dc, dc->clk_mgr);
power_down_all_hw_blocks(dc);
+
+	/* DSC could be enabled on eDP during VBIOS post.
+	 * Clean up the DSC blocks if eDP is linked but not active.
+	 */
+ if (edp_link_with_sink && (edp_stream_num == 0))
+ clean_up_dsc_blocks(dc);
+
disable_vga_and_power_gate_all_controllers(dc);
if (edp_link_with_sink && !keep_edp_vdd_on)
dc->hwss.edp_power_control(edp_link_with_sink, false);
@@ -2046,13 +2096,20 @@ static void set_drr(struct pipe_ctx **pipe_ctx,
* as well.
*/
for (i = 0; i < num_pipes; i++) {
- pipe_ctx[i]->stream_res.tg->funcs->set_drr(
- pipe_ctx[i]->stream_res.tg, &params);
-
- if (adjust.v_total_max != 0 && adjust.v_total_min != 0)
- pipe_ctx[i]->stream_res.tg->funcs->set_static_screen_control(
- pipe_ctx[i]->stream_res.tg,
- event_triggers, num_frames);
+ /* dc_state_destruct() might null the stream resources, so fetch tg
+ * here first to avoid a race condition. The lifetime of the pointee
+ * itself (the timing_generator object) is not a problem here.
+ */
+ struct timing_generator *tg = pipe_ctx[i]->stream_res.tg;
+
+ if ((tg != NULL) && tg->funcs) {
+ if (tg->funcs->set_drr)
+ tg->funcs->set_drr(tg, &params);
+ if (adjust.v_total_max != 0 && adjust.v_total_min != 0)
+ if (tg->funcs->set_static_screen_control)
+ tg->funcs->set_static_screen_control(
+ tg, event_triggers, num_frames);
+ }
}
}
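The set_drr() change is the usual defense against racing teardown: read the shared pointer once into a local, then perform every check and call through that local, so a concurrent dc_state_destruct() nulling stream_res.tg between the check and the use cannot fault. A minimal sketch of the pattern:

	/* Snapshot-then-check: one read of the shared pointer, all
	 * subsequent dereferences go through the local copy.
	 */
	struct timing_generator *tg = pipe_ctx[i]->stream_res.tg;

	if (tg && tg->funcs && tg->funcs->set_drr)
		tg->funcs->set_drr(tg, &params);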
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c
index 42c52284a868..bded33575493 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn30/dcn30_hwseq.c
@@ -455,7 +455,7 @@ bool dcn30_mmhubbub_warmup(
struct mcif_wb *mcif_wb;
struct mcif_warmup_params warmup_params = {0};
unsigned int i, i_buf;
- /*make sure there is no active DWB eanbled */
+ /* make sure there is no active DWB enabled */
for (i = 0; i < num_dwb; i++) {
dwb = dc->res_pool->dwbc[wb_info[i].dwb_pipe_inst];
if (dwb->dwb_is_efc_transition || dwb->dwb_is_drc) {
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c
index a36e11606f90..2e8c9f738259 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn32/dcn32_hwseq.c
@@ -1032,6 +1032,20 @@ void dcn32_update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable)
struct dsc_config dsc_cfg;
struct dsc_optc_config dsc_optc_cfg = {0};
enum optc_dsc_mode optc_dsc_mode;
+ struct dcn_dsc_state dsc_state = {0};
+
+ if (!dsc) {
+ DC_LOG_DSC("DSC is NULL for tg instance %d:", pipe_ctx->stream_res.tg->inst);
+ return;
+ }
+
+ if (dsc->funcs->dsc_read_state) {
+ dsc->funcs->dsc_read_state(dsc, &dsc_state);
+ if (!dsc_state.dsc_fw_en) {
+ DC_LOG_DSC("DSC has been disabled for tg instance %d:", pipe_ctx->stream_res.tg->inst);
+ return;
+ }
+ }
/* Enable DSC hw block */
dsc_cfg.pic_width = (stream->timing.h_addressable + stream->timing.h_border_left + stream->timing.h_border_right) / opp_cnt;
diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
index 479fd3e89e5a..bd309dbdf7b2 100644
--- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
+++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c
@@ -334,7 +334,20 @@ static void update_dsc_on_stream(struct pipe_ctx *pipe_ctx, bool enable)
struct dsc_config dsc_cfg;
struct dsc_optc_config dsc_optc_cfg = {0};
enum optc_dsc_mode optc_dsc_mode;
+	struct dcn_dsc_state dsc_state = {0};
+
+	if (!dsc) {
+ DC_LOG_DSC("DSC is NULL for tg instance %d:", pipe_ctx->stream_res.tg->inst);
+ return;
+ }
+
+ if (dsc->funcs->dsc_read_state) {
+ dsc->funcs->dsc_read_state(dsc, &dsc_state);
+ if (!dsc_state.dsc_fw_en) {
+ DC_LOG_DSC("DSC has been disabled for tg instance %d:", pipe_ctx->stream_res.tg->inst);
+ return;
+ }
+ }
/* Enable DSC hw block */
dsc_cfg.pic_width = (stream->timing.h_addressable + stream->timing.h_border_left + stream->timing.h_border_right) / opp_cnt;
dsc_cfg.pic_height = stream->timing.v_addressable + stream->timing.v_border_top + stream->timing.v_border_bottom;
diff --git a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c
index 46fb3649bc86..6499807af72a 100644
--- a/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c
+++ b/drivers/gpu/drm/amd/display/dc/link/hwss/link_hwss_dpia.c
@@ -50,8 +50,31 @@ static void update_dpia_stream_allocation_table(struct dc_link *link,
DC_LOG_MST("dpia : status[%d]: alloc_slots[%d]: used_slots[%d]\n",
status, mst_alloc_slots, prev_mst_slots_in_use);
- ASSERT(link_enc);
- link_enc->funcs->update_mst_stream_allocation_table(link_enc, table);
+ if (link_enc)
+ link_enc->funcs->update_mst_stream_allocation_table(link_enc, table);
+}
+
+static void set_dio_dpia_link_test_pattern(struct dc_link *link,
+ const struct link_resource *link_res,
+ struct encoder_set_dp_phy_pattern_param *tp_params)
+{
+	struct link_encoder *link_enc = link_enc_cfg_get_link_enc(link);
+
+	if (tp_params->dp_phy_pattern != DP_TEST_PATTERN_VIDEO_MODE)
+		return;
+
+	if (!link_enc)
+		return;
+
+ link_enc->funcs->dp_set_phy_pattern(link_enc, tp_params);
+ link->dc->link_srv->dp_trace_source_sequence(link, DPCD_SOURCE_SEQ_AFTER_SET_SOURCE_PATTERN);
+}
+
+static void set_dio_dpia_lane_settings(struct dc_link *link,
+ const struct link_resource *link_res,
+ const struct dc_link_settings *link_settings,
+ const struct dc_lane_settings lane_settings[LANE_COUNT_DP_MAX])
+{
}
static const struct link_hwss dpia_link_hwss = {
@@ -65,8 +88,8 @@ static const struct link_hwss dpia_link_hwss = {
.ext = {
.set_throttled_vcp_size = set_dio_throttled_vcp_size,
.enable_dp_link_output = enable_dio_dp_link_output,
- .set_dp_link_test_pattern = set_dio_dp_link_test_pattern,
- .set_dp_lane_settings = set_dio_dp_lane_settings,
+ .set_dp_link_test_pattern = set_dio_dpia_link_test_pattern,
+ .set_dp_lane_settings = set_dio_dpia_lane_settings,
.update_stream_allocation_table = update_dpia_stream_allocation_table,
},
};
diff --git a/drivers/gpu/drm/amd/display/dc/link/link_validation.c b/drivers/gpu/drm/amd/display/dc/link/link_validation.c
index 1aed55b0ab6a..60f15a9ba7a5 100644
--- a/drivers/gpu/drm/amd/display/dc/link/link_validation.c
+++ b/drivers/gpu/drm/amd/display/dc/link/link_validation.c
@@ -287,6 +287,13 @@ static bool dp_validate_mode_timing(
req_bw = dc_bandwidth_in_kbps_from_timing(timing, dc_link_get_highest_encoding_format(link));
max_bw = dp_link_bandwidth_kbps(link, link_setting);
+ bool is_max_uncompressed_pixel_rate_exceeded = link->dpcd_caps.max_uncompressed_pixel_rate_cap.bits.valid &&
+ timing->pix_clk_100hz > link->dpcd_caps.max_uncompressed_pixel_rate_cap.bits.max_uncompressed_pixel_rate_cap * 10000;
+
+	if (is_max_uncompressed_pixel_rate_exceeded && !timing->flags.DSC)
+		return false;
+
if (req_bw <= max_bw) {
/* remember the biggest mode here, during
* initial link training (to get
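The units in the new check line up as follows: timing->pix_clk_100hz counts 100 Hz units, and the *10000 scaling implies the DPCD cap is reported in MHz (1 MHz = 10000 x 100 Hz). A worked example under that assumption:

	/* Assumed: DPCD 0x221c reports the cap in MHz. */
	uint32_t cap_mhz = 600;				/* sink's uncompressed limit */
	uint32_t cap_100hz = cap_mhz * 10000;		/* 6,000,000 x 100 Hz */
	uint32_t pix_clk_100hz = 11880000;		/* ~1188 MHz, e.g. 4k@120 */
	bool needs_dsc = pix_clk_100hz > cap_100hz;	/* true: mode rejected
							 * unless timing->flags.DSC */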
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
index 34a618a7278b..d78c8ec4de79 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c
@@ -1942,6 +1942,11 @@ static bool retrieve_link_cap(struct dc_link *link)
DC_LOG_DP2("\tFEC aggregated error counters are supported");
}
+ core_link_read_dpcd(link,
+ DPCD_MAX_UNCOMPRESSED_PIXEL_RATE_CAP,
+ link->dpcd_caps.max_uncompressed_pixel_rate_cap.raw,
+ sizeof(link->dpcd_caps.max_uncompressed_pixel_rate_cap.raw));
+
retrieve_cable_id(link);
dpcd_write_cable_id_to_dprx(link);
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c
index 988999c44475..27b881f947e8 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.c
@@ -515,6 +515,41 @@ bool dp_is_interlane_aligned(union lane_align_status_updated align_status)
return align_status.bits.INTERLANE_ALIGN_DONE == 1;
}
+bool dp_check_interlane_aligned(union lane_align_status_updated align_status,
+ struct dc_link *link,
+ uint8_t retries)
+{
+	/* Take into account a corner case for DP 1.4a LL Compliance CTS, as USB4
+	 * has to share encoders, unlike DP and USB-C.
+	 */
+ return (dp_is_interlane_aligned(align_status) ||
+ (link->skip_fallback_on_link_loss && retries));
+}
+
+uint32_t dp_get_eq_aux_rd_interval(
+ const struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint32_t offset,
+ uint8_t retries)
+{
+ if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) {
+ if (offset == 0 && retries == 1 && lt_settings->lttpr_mode == LTTPR_MODE_NON_TRANSPARENT)
+ return max(lt_settings->eq_pattern_time, (uint32_t) DPIA_CLK_SYNC_DELAY);
+ else
+ return dpia_get_eq_aux_rd_interval(link, lt_settings, offset);
+ } else if (is_repeater(lt_settings, offset))
+ return dp_translate_training_aux_read_interval(
+ link->dpcd_caps.lttpr_caps.aux_rd_interval[offset - 1]);
+ else
+ return lt_settings->eq_pattern_time;
+}
+
+bool dp_check_dpcd_request_status(const struct dc_link *link,
+ enum dc_status status)
+{
+ return (status != DC_OK && link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA);
+}
+
enum link_training_result dp_check_link_loss_status(
struct dc_link *link,
const struct link_training_settings *link_training_setting)
@@ -973,13 +1008,17 @@ void repeater_training_done(struct dc_link *link, uint32_t offset)
dpcd_pattern.v1_4.TRAINING_PATTERN_SET);
}
-static void dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding encoding)
+static enum link_training_result dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding encoding)
{
+ enum dc_status status;
uint8_t sink_status = 0;
uint8_t i;
/* clear training pattern set */
- dpcd_set_training_pattern(link, DP_TRAINING_PATTERN_VIDEOIDLE);
+ status = dpcd_set_training_pattern(link, DP_TRAINING_PATTERN_VIDEOIDLE);
+
+	if (dp_check_dpcd_request_status(link, status))
+ return LINK_TRAINING_ABORT;
if (encoding == DP_128b_132b_ENCODING) {
/* poll for intra-hop disable */
@@ -990,6 +1029,8 @@ static void dpcd_exit_training_mode(struct dc_link *link, enum dp_link_encoding
fsleep(1000);
}
}
+
+ return LINK_TRAINING_SUCCESS;
}
enum dc_status dpcd_configure_channel_coding(struct dc_link *link,
@@ -1013,17 +1054,18 @@ enum dc_status dpcd_configure_channel_coding(struct dc_link *link,
return status;
}
-void dpcd_set_training_pattern(
+enum dc_status dpcd_set_training_pattern(
struct dc_link *link,
enum dc_dp_training_pattern training_pattern)
{
+ enum dc_status status;
union dpcd_training_pattern dpcd_pattern = {0};
dpcd_pattern.v1_4.TRAINING_PATTERN_SET =
dp_training_pattern_to_dpcd_training_pattern(
link, training_pattern);
- core_link_write_dpcd(
+ status = core_link_write_dpcd(
link,
DP_TRAINING_PATTERN_SET,
&dpcd_pattern.raw,
@@ -1033,6 +1075,8 @@ void dpcd_set_training_pattern(
__func__,
DP_TRAINING_PATTERN_SET,
dpcd_pattern.v1_4.TRAINING_PATTERN_SET);
+
+ return status;
}
enum dc_status dpcd_set_link_settings(
@@ -1185,6 +1229,13 @@ void dpcd_set_lt_pattern_and_lane_settings(
dpcd_lt_buffer[DP_TRAINING_PATTERN_SET - DP_TRAINING_PATTERN_SET]
= dpcd_pattern.raw;
+ if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA)
+ dpia_set_tps_notification(
+ link,
+ lt_settings,
+ dpcd_pattern.v1_4.TRAINING_PATTERN_SET,
+ offset);
+
if (is_repeater(lt_settings, offset)) {
DC_LOG_HW_LINK_TRAINING("%s\n LTTPR Repeater ID: %d\n 0x%X pattern = %x\n",
__func__,
@@ -1455,7 +1506,8 @@ static enum link_training_result dp_transition_to_video_idle(
*/
if (link->connector_signal != SIGNAL_TYPE_EDP && status == LINK_TRAINING_SUCCESS) {
msleep(5);
- status = dp_check_link_loss_status(link, lt_settings);
+ if (!link->skip_fallback_on_link_loss)
+ status = dp_check_link_loss_status(link, lt_settings);
}
return status;
}
@@ -1521,7 +1573,9 @@ enum link_training_result dp_perform_link_training(
ASSERT(0);
/* exit training mode */
- dpcd_exit_training_mode(link, encoding);
+ if ((dpcd_exit_training_mode(link, encoding) != LINK_TRAINING_SUCCESS || status == LINK_TRAINING_ABORT) &&
+ link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA)
+ dpia_training_abort(link, &lt_settings, 0);
/* switch to video idle */
if ((status == LINK_TRAINING_SUCCESS) || !skip_video_pattern)
@@ -1599,8 +1653,7 @@ bool perform_link_training_with_retries(
dp_perform_link_training_skip_aux(link, &pipe_ctx->link_res, &cur_link_settings);
return true;
} else {
- /** @todo Consolidate USB4 DP and DPx.x training. */
- if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) {
+ if (!link->dc->config.consolidated_dpia_dp_lt && link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA) {
status = dpia_perform_link_training(
link,
&pipe_ctx->link_res,
@@ -1629,8 +1682,17 @@ bool perform_link_training_with_retries(
dp_trace_lt_total_count_increment(link, false);
dp_trace_lt_result_update(link, status, false);
dp_trace_set_lt_end_timestamp(link, false);
- if (status == LINK_TRAINING_SUCCESS && !is_link_bw_low)
+ if (status == LINK_TRAINING_SUCCESS && !is_link_bw_low) {
+			// Update the verified link settings to the current ones,
+			// because DPIA LT might fall back to a lower link setting.
+ if (link->ep_type == DISPLAY_ENDPOINT_USB4_DPIA &&
+ stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST) {
+ link->verified_link_cap.link_rate = link->cur_link_settings.link_rate;
+ link->verified_link_cap.lane_count = link->cur_link_settings.lane_count;
+ dm_helpers_dp_mst_update_branch_bandwidth(link->ctx, link);
+ }
return true;
+ }
}
fail_count++;
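A worked example of the consolidated dp_get_eq_aux_rd_interval(): for a USB4 DPIA link in non-transparent LTTPR mode, the first EQ status read on the DPRX hop must wait at least DPIA_CLK_SYNC_DELAY, the approximate time to transmit 9 USB4 DP clock sync packets, regardless of the shorter nominal EQ pattern time:

	/* offset 0 = DPRX, first retry, non-transparent LTTPR mode */
	uint32_t wait_us = dp_get_eq_aux_rd_interval(link, lt_settings,
						     /*offset=*/0, /*retries=*/1);
	/* wait_us == max(lt_settings->eq_pattern_time, 16000), so a nominal
	 * 400 us EQ interval is stretched to 16000 us on this one read.
	 */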
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h
index 851bd17317a0..0b18aa35c33c 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training.h
@@ -55,7 +55,7 @@ void dp_set_hw_test_pattern(
uint8_t *custom_pattern,
uint32_t custom_pattern_size);
-void dpcd_set_training_pattern(
+enum dc_status dpcd_set_training_pattern(
struct dc_link *link,
enum dc_dp_training_pattern training_pattern);
@@ -182,4 +182,18 @@ uint32_t dp_translate_training_aux_read_interval(
uint8_t dp_get_nibble_at_index(const uint8_t *buf,
uint32_t index);
+
+bool dp_check_interlane_aligned(union lane_align_status_updated align_status,
+ struct dc_link *link,
+ uint8_t retries);
+
+uint32_t dp_get_eq_aux_rd_interval(
+ const struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint32_t offset,
+ uint8_t retries);
+
+bool dp_check_dpcd_request_status(const struct dc_link *link,
+ enum dc_status status);
+
#endif /* __DC_LINK_DP_TRAINING_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c
index 2b4c15b0b407..3bdce32a85e3 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c
@@ -157,6 +157,7 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence(
struct link_training_settings *lt_settings,
uint32_t offset)
{
+ enum dc_status status;
uint32_t retries_cr;
uint32_t retry_count;
uint32_t wait_time_microsec;
@@ -216,7 +217,7 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence(
/* 4. Read lane status and requested drive
* settings as set by the sink
*/
- dp_get_lane_status_and_lane_adjust(
+ status = dp_get_lane_status_and_lane_adjust(
link,
lt_settings,
dpcd_lane_status,
@@ -224,6 +225,9 @@ enum link_training_result perform_8b_10b_clock_recovery_sequence(
dpcd_lane_adjust,
offset);
+	if (dp_check_dpcd_request_status(link, status))
+ return LINK_TRAINING_ABORT;
+
/* 5. check CR done*/
if (dp_is_cr_done(lane_count, dpcd_lane_status)) {
DC_LOG_HW_LINK_TRAINING("%s: Clock recovery OK\n", __func__);
@@ -273,6 +277,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence(
struct link_training_settings *lt_settings,
uint32_t offset)
{
+ enum dc_status status;
enum dc_dp_training_pattern tr_pattern;
uint32_t retries_ch_eq;
uint32_t wait_time_microsec;
@@ -308,12 +313,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence(
dpcd_set_lane_settings(link, lt_settings, offset);
/* 3. wait for receiver to lock-on*/
- wait_time_microsec = lt_settings->eq_pattern_time;
-
- if (is_repeater(lt_settings, offset))
- wait_time_microsec =
- dp_translate_training_aux_read_interval(
- link->dpcd_caps.lttpr_caps.aux_rd_interval[offset - 1]);
+ wait_time_microsec = dp_get_eq_aux_rd_interval(link, lt_settings, offset, retries_ch_eq);
dp_wait_for_training_aux_rd_interval(
link,
@@ -322,7 +322,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence(
/* 4. Read lane status and requested
* drive settings as set by the sink*/
- dp_get_lane_status_and_lane_adjust(
+ status = dp_get_lane_status_and_lane_adjust(
link,
lt_settings,
dpcd_lane_status,
@@ -330,6 +330,9 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence(
dpcd_lane_adjust,
offset);
+	if (dp_check_dpcd_request_status(link, status))
+ return LINK_TRAINING_ABORT;
+
/* 5. check CR done*/
if (!dp_is_cr_done(lane_count, dpcd_lane_status))
return dpcd_lane_status[0].bits.CR_DONE_0 ?
@@ -339,7 +342,7 @@ enum link_training_result perform_8b_10b_channel_equalization_sequence(
/* 6. check CHEQ done*/
if (dp_is_ch_eq_done(lane_count, dpcd_lane_status) &&
dp_is_symbol_locked(lane_count, dpcd_lane_status) &&
- dp_is_interlane_aligned(dpcd_lane_status_updated))
+ dp_check_interlane_aligned(dpcd_lane_status_updated, link, retries_ch_eq))
return LINK_TRAINING_SUCCESS;
/* 7. update VS/PE/PC2 in lt_settings*/
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c
index cd1975c03f38..39e4b7dc9588 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.c
@@ -43,9 +43,6 @@
#define DC_LOGGER \
link->ctx->logger
-/* The approximate time (us) it takes to transmit 9 USB4 DP clock sync packets. */
-#define DPIA_CLK_SYNC_DELAY 16000
-
/* Extend interval between training status checks for manual testing. */
#define DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US 60000000
@@ -566,28 +563,6 @@ static enum link_training_result dpia_training_cr_phase(
return result;
}
-/* Return status read interval during equalization phase. */
-static uint32_t dpia_get_eq_aux_rd_interval(
- const struct dc_link *link,
- const struct link_training_settings *lt_settings,
- uint32_t hop)
-{
- uint32_t wait_time_microsec;
-
- if (hop == DPRX)
- wait_time_microsec = lt_settings->eq_pattern_time;
- else
- wait_time_microsec =
- dp_translate_training_aux_read_interval(
- link->dpcd_caps.lttpr_caps.aux_rd_interval[hop - 1]);
-
- /* Check debug option for extending aux read interval. */
- if (link->dc->debug.dpia_debug.bits.extend_aux_rd_interval)
- wait_time_microsec = DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US;
-
- return wait_time_microsec;
-}
-
/* Execute equalization phase of link training for specified hop in display
* path in non-transparent mode:
* - driver issues both DPCD and SET_CONFIG transactions.
@@ -936,6 +911,22 @@ static enum link_training_result dpia_training_end(
return result;
}
+/* Return status read interval during equalization phase. */
+uint32_t dpia_get_eq_aux_rd_interval(
+ const struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint32_t hop)
+{
+ /* Check debug option for extending aux read interval. */
+ if (link->dc->debug.dpia_debug.bits.extend_aux_rd_interval)
+ return DPIA_DEBUG_EXTENDED_AUX_RD_INTERVAL_US;
+ else if (hop == DPRX)
+ return lt_settings->eq_pattern_time;
+ else
+ return dp_translate_training_aux_read_interval(
+ link->dpcd_caps.lttpr_caps.aux_rd_interval[hop - 1]);
+}
+
/* When aborting training of specified hop in display path, clean up by:
* - Attempting to clear DPCD TRAINING_PATTERN_SET, LINK_BW_SET and LANE_COUNT_SET.
* - Sending SET_CONFIG(SET_LINK) with lane count and link rate set to 0.
@@ -943,7 +934,7 @@ static enum link_training_result dpia_training_end(
* @param link DPIA link being trained.
* @param hop Hop in display path. DPRX = 0.
*/
-static void dpia_training_abort(
+void dpia_training_abort(
struct dc_link *link,
struct link_training_settings *lt_settings,
uint32_t hop)
@@ -968,7 +959,26 @@ static void dpia_training_abort(
core_link_write_dpcd(link, dpcd_tps_offset, &data, 1);
core_link_write_dpcd(link, DP_LINK_BW_SET, &data, 1);
core_link_write_dpcd(link, DP_LANE_COUNT_SET, &data, 1);
- core_link_send_set_config(link, DPIA_SET_CFG_SET_LINK, data);
+
+ if (!link->dc->config.consolidated_dpia_dp_lt)
+ core_link_send_set_config(link, DPIA_SET_CFG_SET_LINK, data);
+}
+
+void dpia_set_tps_notification(
+ struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint8_t pattern,
+ uint32_t hop)
+{
+ uint8_t repeater_cnt = 0; /* Number of hops/repeaters in display path. */
+
+ if (lt_settings->lttpr_mode != LTTPR_MODE_NON_TRANSPARENT || pattern == DPCD_TRAINING_PATTERN_VIDEOIDLE)
+ return;
+
+ repeater_cnt = dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt);
+
+ if (hop != repeater_cnt)
+ dc_process_dmub_dpia_set_tps_notification(link->ctx->dc, link->link_index, pattern);
}
enum link_training_result dpia_perform_link_training(
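The gating in dpia_set_tps_notification() can be read as a single predicate: DMUB is notified only in non-transparent LTTPR mode, only for a real training pattern, and only for hops below the parsed repeater count. A sketch of that predicate (a hypothetical helper, not in the patch):

	static bool tps_notification_needed(enum lttpr_mode mode, uint8_t pattern,
					    uint32_t hop, uint8_t repeater_cnt)
	{
		return mode == LTTPR_MODE_NON_TRANSPARENT &&
		       pattern != DPCD_TRAINING_PATTERN_VIDEOIDLE &&
		       hop != repeater_cnt;
	}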
diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h
index b39fb9faf1c2..9f4eceb494c2 100644
--- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h
+++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_dpia.h
@@ -28,6 +28,9 @@
#define __DC_LINK_DP_TRAINING_DPIA_H__
#include "link_dp_training.h"
+/* The approximate time (us) it takes to transmit 9 USB4 DP clock sync packets. */
+#define DPIA_CLK_SYNC_DELAY 16000
+
/* Train DP tunneling link for USB4 DPIA display endpoint.
 * DPIA equivalent of dc_link_dp_perform_link_training.
* Aborts link training upon detection of sink unplug.
@@ -38,4 +41,20 @@ enum link_training_result dpia_perform_link_training(
const struct dc_link_settings *link_setting,
bool skip_video_pattern);
+void dpia_training_abort(
+ struct dc_link *link,
+ struct link_training_settings *lt_settings,
+ uint32_t hop);
+
+uint32_t dpia_get_eq_aux_rd_interval(
+ const struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint32_t hop);
+
+void dpia_set_tps_notification(
+ struct dc_link *link,
+ const struct link_training_settings *lt_settings,
+ uint8_t pattern,
+ uint32_t offset);
+
#endif /* __DC_LINK_DP_TRAINING_DPIA_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c
index 46ad684fe192..893a9d9ee870 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c
@@ -2155,6 +2155,7 @@ static bool dcn35_resource_construct(
dc->dml2_options.max_segments_per_hubp = 24;
dc->dml2_options.det_segment_size = DCN3_2_DET_SEG_SIZE;/*todo*/
+ dc->dml2_options.override_det_buffer_size_kbytes = true;
if (dc->config.sdpif_request_limit_words_per_umc == 0)
dc->config.sdpif_request_limit_words_per_umc = 16;/*todo*/
diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c
index 4c5e722baa3a..da9101b83e8c 100644
--- a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c
@@ -736,7 +736,7 @@ static const struct dc_debug_options debug_defaults_drv = {
.hdmichar = true,
.dpstream = true,
.symclk32_se = true,
- .symclk32_le = true,
+ .symclk32_le = false,
.symclk_fe = true,
.physymclk = false,
.dpiasymclk = true,
@@ -2133,6 +2133,7 @@ static bool dcn351_resource_construct(
dc->dml2_options.max_segments_per_hubp = 24;
dc->dml2_options.det_segment_size = DCN3_2_DET_SEG_SIZE;/*todo*/
+ dc->dml2_options.override_det_buffer_size_kbytes = true;
if (dc->config.sdpif_request_limit_words_per_umc == 0)
dc->config.sdpif_request_limit_words_per_umc = 16;/*todo*/
diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c
index 15f7eda903e6..014e8a296f0c 100644
--- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c
+++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl.c
@@ -813,6 +813,14 @@ static bool enable_easf(struct spl_in *spl_in, struct spl_scratch *spl_scratch)
return skip_easf;
}
+/* Check if video is in fullscreen mode */
+static bool spl_is_video_fullscreen(struct spl_in *spl_in)
+{
+ if (spl_is_yuv420(spl_in->basic_in.format) && spl_in->is_fullscreen)
+ return true;
+ return false;
+}
+
static bool spl_get_isharp_en(struct spl_in *spl_in,
struct spl_scratch *spl_scratch)
{
@@ -820,6 +828,7 @@ static bool spl_get_isharp_en(struct spl_in *spl_in,
int vratio = 0;
int hratio = 0;
struct spl_taps taps = spl_scratch->scl_data.taps;
+ bool fullscreen = spl_is_video_fullscreen(spl_in);
/* Return if adaptive sharpness is disabled */
if (spl_in->adaptive_sharpness.enable == false)
@@ -835,9 +844,18 @@ static bool spl_get_isharp_en(struct spl_in *spl_in,
// Scaling is up to 1:1 (no scaling) or upscaling
/*
- * Apply sharpness to all RGB surfaces and to
- * NV12/P010 surfaces
+ * Apply sharpness to RGB and YUV (NV12/P010)
+ * surfaces based on policy setting
*/
+ if (!spl_is_yuv420(spl_in->basic_in.format) &&
+ (spl_in->debug.sharpen_policy == SHARPEN_YUV))
+ return enable_isharp;
+ else if ((spl_is_yuv420(spl_in->basic_in.format) && !fullscreen) &&
+ (spl_in->debug.sharpen_policy == SHARPEN_RGB_FULLSCREEN_YUV))
+ return enable_isharp;
+ else if (!spl_in->is_fullscreen &&
+ spl_in->debug.sharpen_policy == SHARPEN_FULLSCREEN_ALL)
+ return enable_isharp;
/*
* Apply sharpness if supports horizontal taps 4,6 AND
@@ -1155,14 +1173,19 @@ static void spl_set_dscl_prog_data(struct spl_in *spl_in, struct spl_scratch *sp
}
/* Calculate C0-C3 coefficients based on HDR_mult */
-static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint32_t hdr_multx100)
+static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint32_t sdr_white_level_nits)
{
struct spl_fixed31_32 hdr_mult, c0_mult, c1_mult, c2_mult;
struct spl_fixed31_32 c0_calc, c1_calc, c2_calc;
struct spl_custom_float_format fmt;
+ uint32_t hdr_multx100_int;
- SPL_ASSERT(hdr_multx100);
- hdr_mult = spl_fixpt_from_fraction((long long)hdr_multx100, 100LL);
+ if ((sdr_white_level_nits >= 80) && (sdr_white_level_nits <= 480))
+ hdr_multx100_int = sdr_white_level_nits * 100 / 80;
+ else
+		hdr_multx100_int = 100; /* out of range: default to 1.00x (80 nits) */
+
+ hdr_mult = spl_fixpt_from_fraction((long long)hdr_multx100_int, 100LL);
c0_mult = spl_fixpt_from_fraction(2126LL, 10000LL);
c1_mult = spl_fixpt_from_fraction(7152LL, 10000LL);
c2_mult = spl_fixpt_from_fraction(722LL, 10000LL);
@@ -1191,7 +1214,7 @@ static void spl_calculate_c0_c3_hdr(struct dscl_prog_data *dscl_prog_data, uint3
static void spl_set_easf_data(struct spl_scratch *spl_scratch, struct spl_out *spl_out, bool enable_easf_v,
bool enable_easf_h, enum linear_light_scaling lls_pref,
enum spl_pixel_format format, enum system_setup setup,
- uint32_t hdr_multx100)
+ uint32_t sdr_white_level_nits)
{
struct dscl_prog_data *dscl_prog_data = spl_out->dscl_prog_data;
if (enable_easf_v) {
@@ -1499,7 +1522,7 @@ static void spl_set_easf_data(struct spl_scratch *spl_scratch, struct spl_out *s
dscl_prog_data->easf_ltonl_en = 1; // Linear input
if ((setup == HDR_L) && (spl_is_rgb8(format))) {
/* Calculate C0-C3 coefficients based on HDR multiplier */
- spl_calculate_c0_c3_hdr(dscl_prog_data, hdr_multx100);
+ spl_calculate_c0_c3_hdr(dscl_prog_data, sdr_white_level_nits);
} else { // HDR_L ( DWM ) and SDR_L
dscl_prog_data->easf_matrix_c0 =
0x4EF7; // fp1.5.10, C0 coefficient (LN_rec709: 0.2126 * (2^14)/125 = 27.86590720)
@@ -1557,7 +1580,7 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data,
struct adaptive_sharpness adp_sharpness, bool enable_isharp,
enum linear_light_scaling lls_pref, enum spl_pixel_format format,
const struct spl_scaler_data *data, struct spl_fixed31_32 ratio,
- enum system_setup setup)
+ enum system_setup setup, enum scale_to_sharpness_policy scale_to_sharpness_policy)
{
/* Turn off sharpener if not required */
if (!enable_isharp) {
@@ -1565,6 +1588,11 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data,
return;
}
+ spl_build_isharp_1dlut_from_reference_curve(ratio, setup, adp_sharpness,
+ scale_to_sharpness_policy);
+ dscl_prog_data->isharp_delta = spl_get_pregen_filter_isharp_1D_lut(setup);
+ dscl_prog_data->sharpness_level = adp_sharpness.sharpness_level;
+
dscl_prog_data->isharp_en = 1; // ISHARP_EN
// Set ISHARP_NOISEDET_MODE if htaps = 6-tap
if (data->taps.h_taps == 6) {
@@ -1662,11 +1690,6 @@ static void spl_set_isharp_data(struct dscl_prog_data *dscl_prog_data,
dscl_prog_data->isharp_lba.base_seg[5] = 0; // ISHARP LBA PWL for Seg 5. BASE value in U0.6 format
}
-
- spl_build_isharp_1dlut_from_reference_curve(ratio, setup, adp_sharpness);
- dscl_prog_data->isharp_delta = spl_get_pregen_filter_isharp_1D_lut(setup);
- dscl_prog_data->sharpness_level = adp_sharpness.sharpness_level;
-
// Program the nldelta soft clip values
if (lls_pref == LLS_PREF_YES) {
dscl_prog_data->isharp_nldelta_sclip.enable_p = 0; /* ISHARP_NLDELTA_SCLIP_EN_P */
@@ -1750,7 +1773,7 @@ bool spl_calculate_scaler_params(struct spl_in *spl_in, struct spl_out *spl_out)
// Set EASF
spl_set_easf_data(&spl_scratch, spl_out, enable_easf_v, enable_easf_h, spl_in->lls_pref,
- spl_in->basic_in.format, setup, spl_in->hdr_multx100);
+ spl_in->basic_in.format, setup, spl_in->sdr_white_level_nits);
// Set iSHARP
vratio = spl_fixpt_ceil(spl_scratch.scl_data.ratios.vert);
@@ -1761,7 +1784,8 @@ bool spl_calculate_scaler_params(struct spl_in *spl_in, struct spl_out *spl_out)
isharp_scale_ratio = spl_scratch.scl_data.recip_ratios.horz;
spl_set_isharp_data(spl_out->dscl_prog_data, spl_in->adaptive_sharpness, enable_isharp,
- spl_in->lls_pref, spl_in->basic_in.format, data, isharp_scale_ratio, setup);
+ spl_in->lls_pref, spl_in->basic_in.format, data, isharp_scale_ratio, setup,
+ spl_in->debug.scale_to_sharpness_policy);
return res;
}
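The switch from hdr_multx100 to sdr_white_level_nits moves the multiplier derivation into SPL: 80 nits is SDR reference white, so the multiplier is nits / 80, accepted only in the 80-480 nit range. A worked example of the conversion in spl_calculate_c0_c3_hdr():

	uint32_t nits = 240;			/* plane's SDR white level */
	uint32_t hdr_multx100;

	if (nits >= 80 && nits <= 480)
		hdr_multx100 = nits * 100 / 80;	/* 240 -> 300, i.e. 3.00x */
	else
		hdr_multx100 = 100;		/* out of range: 1.00x default */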
diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c
index 33712f50d303..e0572252c640 100644
--- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c
+++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.c
@@ -500,6 +500,15 @@ struct isharp_1D_lut_pregen filter_isharp_1D_lut_pregen[NUM_SHARPNESS_SETUPS] =
},
};
+struct scale_ratio_to_sharpness_level_adj sharpness_level_adj[NUM_SHARPNESS_ADJ_LEVELS] = {
+ {1125, 1000, 0},
+ {11, 10, 1},
+ {1075, 1000, 2},
+ {105, 100, 3},
+ {1025, 1000, 4},
+ {1, 1, 5},
+};
+
const uint32_t *spl_get_filter_isharp_1D_lut_0(void)
{
return filter_isharp_1D_lut_0;
@@ -541,19 +550,72 @@ uint16_t *spl_get_filter_isharp_bs_3tap_64p(void)
return filter_isharp_bs_3tap_64p_s1_12;
}
-static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level, enum system_setup setup,
- struct spl_sharpness_range sharpness_range)
+static unsigned int spl_calculate_sharpness_level_adj(struct spl_fixed31_32 ratio)
+{
+ int j;
+ struct spl_fixed31_32 ratio_level;
+ struct scale_ratio_to_sharpness_level_adj *lookup_ptr;
+ unsigned int sharpness_level_down_adj;
+
+ /*
+ * Adjust sharpness level based on current scaling ratio
+ *
+	 * We have five discrete scaling-ratio thresholds; the sharpness level
+	 * is adjusted down by one more step at each threshold we pass. The
+	 * thresholds are
+ *
+ * 1.125 upscale and higher - no adj
+ * 1.100 - under 1.125 - adj level down 1
+ * 1.075 - under 1.100 - adj level down 2
+ * 1.050 - under 1.075 - adj level down 3
+ * 1.025 - under 1.050 - adj level down 4
+ * 1.000 - under 1.025 - adj level down 5
+ *
+ */
+ j = 0;
+ sharpness_level_down_adj = 0;
+ lookup_ptr = sharpness_level_adj;
+ while (j < NUM_SHARPNESS_ADJ_LEVELS) {
+ ratio_level = spl_fixpt_from_fraction(lookup_ptr->ratio_numer,
+ lookup_ptr->ratio_denom);
+ if (ratio.value >= ratio_level.value) {
+ sharpness_level_down_adj = lookup_ptr->level_down_adj;
+ break;
+ }
+ lookup_ptr++;
+ j++;
+ }
+ return sharpness_level_down_adj;
+}
+
+static unsigned int spl_calculate_sharpness_level(struct spl_fixed31_32 ratio,
+ int discrete_sharpness_level, enum system_setup setup,
+ struct spl_sharpness_range sharpness_range,
+ enum scale_to_sharpness_policy scale_to_sharpness_policy)
{
unsigned int sharpness_level = 0;
+ unsigned int sharpness_level_down_adj = 0;
int min_sharpness, max_sharpness, mid_sharpness;
+ /*
+	 * Adjust the sharpness level if the policy requires adjusting it based
+	 * on the scale ratio; the level may be stepped down by a certain
+	 * number of steps. An effective sharpness of 0 is never selected, so
+	 * the lowest discrete level is 1 when min_sharpness is 0, and 0
+	 * otherwise.
+	 *
+	 * If the policy is not required, this code may be removed at a later
+	 * date.
+ */
switch (setup) {
case HDR_L:
min_sharpness = sharpness_range.hdr_rgb_min;
max_sharpness = sharpness_range.hdr_rgb_max;
mid_sharpness = sharpness_range.hdr_rgb_mid;
+ if (scale_to_sharpness_policy == SCALE_TO_SHARPNESS_ADJ_ALL)
+ sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio);
break;
case HDR_NL:
/* currently no use case, use Non-linear SDR values for now */
@@ -561,15 +623,26 @@ static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level,
min_sharpness = sharpness_range.sdr_yuv_min;
max_sharpness = sharpness_range.sdr_yuv_max;
mid_sharpness = sharpness_range.sdr_yuv_mid;
+ if (scale_to_sharpness_policy >= SCALE_TO_SHARPNESS_ADJ_YUV)
+ sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio);
break;
case SDR_L:
default:
min_sharpness = sharpness_range.sdr_rgb_min;
max_sharpness = sharpness_range.sdr_rgb_max;
mid_sharpness = sharpness_range.sdr_rgb_mid;
+ if (scale_to_sharpness_policy == SCALE_TO_SHARPNESS_ADJ_ALL)
+ sharpness_level_down_adj = spl_calculate_sharpness_level_adj(ratio);
break;
}
+ if ((min_sharpness == 0) && (sharpness_level_down_adj >= discrete_sharpness_level))
+ discrete_sharpness_level = 1;
+ else if (sharpness_level_down_adj >= discrete_sharpness_level)
+ discrete_sharpness_level = 0;
+ else
+ discrete_sharpness_level -= sharpness_level_down_adj;
+
int lower_half_step_size = (mid_sharpness - min_sharpness) / 5;
int upper_half_step_size = (max_sharpness - mid_sharpness) / 5;
@@ -584,7 +657,7 @@ static unsigned int spl_calculate_sharpness_level(int discrete_sharpness_level,
}
void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup,
- struct adaptive_sharpness sharpness)
+ struct adaptive_sharpness sharpness, enum scale_to_sharpness_policy scale_to_sharpness_policy)
{
uint8_t *byte_ptr_1dlut_src, *byte_ptr_1dlut_dst;
struct spl_fixed31_32 sharp_base, sharp_calc, sharp_level;
@@ -594,8 +667,9 @@ void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, en
uint32_t filter_pregen_store[ISHARP_LUT_TABLE_SIZE];
/* Custom sharpnessX1000 value */
- unsigned int sharpnessX1000 = spl_calculate_sharpness_level(sharpness.sharpness_level,
- setup, sharpness.sharpness_range);
+ unsigned int sharpnessX1000 = spl_calculate_sharpness_level(ratio,
+ sharpness.sharpness_level, setup,
+ sharpness.sharpness_range, scale_to_sharpness_policy);
sharp_level = spl_fixpt_from_fraction(sharpnessX1000, 1000);
/*
@@ -606,7 +680,6 @@ void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, en
(filter_isharp_1D_lut_pregen[setup].sharpness_denom == 1000))
return;
-
/*
* Calculate LUT_128_gained with this equation:
*
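Putting the two additions together: the scale ratio first maps to a down-adjustment through sharpness_level_adj[], and the requested discrete level is then reduced with a floor of 1 (when min_sharpness is 0) or 0. Worked example:

	/* ratio 1.06 falls in [1.050, 1.075) -> level_down_adj == 3 */
	unsigned int adj = 3;	/* spl_calculate_sharpness_level_adj() result */
	int level = 5;		/* requested discrete sharpness level */

	if (adj >= (unsigned int)level)
		level = 1;	/* floor, assuming min_sharpness == 0 */
	else
		level -= adj;	/* 5 -> 2 for a 1.06x upscale */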
diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h
index fe0b12571f2c..afcc66206ca2 100644
--- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h
+++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_isharp_filters.h
@@ -20,11 +20,11 @@ uint16_t *spl_get_filter_isharp_bs_3tap_64p(void);
const uint16_t *spl_get_filter_isharp_wide_6tap_64p(void);
uint16_t *spl_dscl_get_blur_scale_coeffs_64p(int taps);
-struct scale_ratio_to_sharpness_level_lookup {
+#define NUM_SHARPNESS_ADJ_LEVELS 6
+struct scale_ratio_to_sharpness_level_adj {
unsigned int ratio_numer;
unsigned int ratio_denom;
- unsigned int sharpness_numer;
- unsigned int sharpness_denom;
+ unsigned int level_down_adj; /* adjust sharpness level down */
};
struct isharp_1D_lut_pregen {
@@ -45,6 +45,7 @@ void spl_init_blur_scale_coeffs(void);
void spl_set_blur_scale_data(struct dscl_prog_data *dscl_prog_data,
const struct spl_scaler_data *data);
-void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup, struct adaptive_sharpness sharpness);
+void spl_build_isharp_1dlut_from_reference_curve(struct spl_fixed31_32 ratio, enum system_setup setup,
+ struct adaptive_sharpness sharpness, enum scale_to_sharpness_policy scale_to_sharpness_policy);
uint32_t *spl_get_pregen_filter_isharp_1D_lut(enum system_setup setup);
#endif /* __DC_SPL_ISHARP_FILTERS_H__ */
diff --git a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h
index 85b19ebe2c57..2a74ff5fdfdb 100644
--- a/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h
+++ b/drivers/gpu/drm/amd/display/dc/spl/dc_spl_types.h
@@ -487,6 +487,17 @@ enum linear_light_scaling { // convert it in translation logic
LLS_PREF_YES,
LLS_PREF_NO
};
+enum sharpen_policy {
+ SHARPEN_ALWAYS = 0,
+ SHARPEN_YUV = 1,
+ SHARPEN_RGB_FULLSCREEN_YUV = 2,
+ SHARPEN_FULLSCREEN_ALL = 3
+};
+enum scale_to_sharpness_policy {
+ NO_SCALE_TO_SHARPNESS_ADJ = 0,
+ SCALE_TO_SHARPNESS_ADJ_YUV = 1,
+ SCALE_TO_SHARPNESS_ADJ_ALL = 2
+};
struct spl_funcs {
void (*spl_calc_lb_num_partitions)
(bool alpha_en,
@@ -499,6 +510,8 @@ struct spl_funcs {
struct spl_debug {
int visual_confirm_base_offset;
int visual_confirm_dpp_offset;
+ enum sharpen_policy sharpen_policy;
+ enum scale_to_sharpness_policy scale_to_sharpness_policy;
};
struct spl_in {
@@ -518,7 +531,7 @@ struct spl_in {
bool is_hdr_on;
int h_active;
int v_active;
- int hdr_multx100;
+ int sdr_white_level_nits;
};
// end of SPL inputs
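The two new enums are independent axes: sharpen_policy restricts which surfaces may be sharpened at all, while scale_to_sharpness_policy selects which setups get the ratio-based level reduction. The sharpen_policy axis, as spl_get_isharp_en() applies it, condensed into a hypothetical predicate:

	/* yuv = NV12/P010 surface, fs = fullscreen (for YUV: fullscreen video) */
	static bool sharpen_allowed(enum sharpen_policy p, bool yuv, bool fs)
	{
		switch (p) {
		case SHARPEN_ALWAYS:
			return true;		/* all surfaces */
		case SHARPEN_YUV:
			return yuv;		/* YUV only */
		case SHARPEN_RGB_FULLSCREEN_YUV:
			return !yuv || fs;	/* RGB always, YUV when fullscreen */
		case SHARPEN_FULLSCREEN_ALL:
		default:
			return fs;		/* anything, but only fullscreen */
		}
	}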
diff --git a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
index cd70453aeae0..fe5b6f7a3eb1 100644
--- a/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
+++ b/drivers/gpu/drm/amd/display/dmub/dmub_srv.h
@@ -300,6 +300,7 @@ struct dmub_srv_hw_params {
enum dmub_ips_disable_type disable_ips;
bool disallow_phy_access;
bool disable_sldo_opt;
+ bool enable_non_transparent_setconfig;
};
/**
diff --git a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
index e20c220aa8b4..ebcf68bfae2b 100644
--- a/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
+++ b/drivers/gpu/drm/amd/display/dmub/inc/dmub_cmd.h
@@ -682,7 +682,7 @@ union dmub_fw_boot_options {
uint32_t gpint_scratch8: 1; /* 1 if GPINT is in scratch8*/
uint32_t usb4_cm_version: 1; /**< 1 CM support */
uint32_t dpia_hpd_int_enable_supported: 1; /* 1 if dpia hpd int enable supported */
- uint32_t reserved0: 1;
+	uint32_t enable_non_transparent_setconfig: 1; /* 1 if dpia uses the conventional dp lt flow */
uint32_t disable_clk_ds: 1; /* 1 if disallow dispclk_ds and dppclk_ds*/
uint32_t disable_timeout_recovery : 1; /* 1 if timeout recovery should be disabled */
uint32_t ips_pg_disable: 1; /* 1 to disable ONO domains power gating*/
@@ -1308,6 +1308,7 @@ enum dmub_cmd_dpia_type {
DMUB_CMD__DPIA_DIG1_DPIA_CONTROL = 0,
DMUB_CMD__DPIA_SET_CONFIG_ACCESS = 1,
DMUB_CMD__DPIA_MST_ALLOC_SLOTS = 2,
+ DMUB_CMD__DPIA_SET_TPS_NOTIFICATION = 3,
};
/* DMUB_OUT_CMD__DPIA_NOTIFICATION command types. */
@@ -2139,6 +2140,24 @@ struct dmub_rb_cmd_set_mst_alloc_slots {
};
/**
+ * Data passed from driver to FW in a DMUB_CMD__DPIA_SET_TPS_NOTIFICATION command.
+ */
+struct dmub_cmd_tps_notification_data {
+ uint8_t instance; /* DPIA instance */
+ uint8_t tps; /* requested training pattern */
+ uint8_t reserved1;
+ uint8_t reserved2;
+};
+
+/**
+ * DMUB command structure for SET_TPS_NOTIFICATION command.
+ */
+struct dmub_rb_cmd_set_tps_notification {
+ struct dmub_cmd_header header; /* header */
+ struct dmub_cmd_tps_notification_data tps_notification; /* set tps_notification data */
+};
+
+/**
* DMUB command structure for DPIA HPD int enable control.
*/
struct dmub_rb_cmd_dpia_hpd_int_enable {
@@ -5305,6 +5324,10 @@ union dmub_rb_cmd {
*/
struct dmub_rb_cmd_set_mst_alloc_slots set_mst_alloc_slots;
/**
+ * Definition of a DMUB_CMD__DPIA_SET_TPS_NOTIFICATION command.
+ */
+ struct dmub_rb_cmd_set_tps_notification set_tps_notification;
+ /**
* Definition of a DMUB_CMD__EDID_CEA command.
*/
struct dmub_rb_cmd_edid_cea edid_cea;
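A driver-side fill of the new command, as a hedged sketch: the payload is just the DPIA instance and the requested TPS; the header convention and the dc_wake_and_execute_dmub_cmd() submission helper are assumptions based on how other DPIA commands in this header are sent, and dpia_instance/pattern are assumed locals:

	union dmub_rb_cmd cmd = {0};

	cmd.set_tps_notification.header.type = DMUB_CMD__DPIA;
	cmd.set_tps_notification.header.sub_type = DMUB_CMD__DPIA_SET_TPS_NOTIFICATION;
	cmd.set_tps_notification.header.payload_bytes =
		sizeof(cmd.set_tps_notification.tps_notification);
	cmd.set_tps_notification.tps_notification.instance = dpia_instance;
	cmd.set_tps_notification.tps_notification.tps = pattern;

	dc_wake_and_execute_dmub_cmd(dc->ctx, &cmd, DM_DMUB_WAIT_TYPE_WAIT);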
diff --git a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c
index 746696b6f09a..2ccad79053c5 100644
--- a/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c
+++ b/drivers/gpu/drm/amd/display/dmub/src/dmub_dcn35.c
@@ -425,6 +425,7 @@ void dmub_dcn35_enable_dmub_boot_options(struct dmub_srv *dmub, const struct dmu
boot_options.bits.ips_disable = params->disable_ips;
boot_options.bits.ips_sequential_ono = params->ips_sequential_ono;
boot_options.bits.disable_sldo_opt = params->disable_sldo_opt;
+ boot_options.bits.enable_non_transparent_setconfig = params->enable_non_transparent_setconfig;
REG_WRITE(DMCUB_SCRATCH14, boot_options.all);
}
diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index a40e6590215a..bbd259cea4f4 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -134,7 +134,7 @@ unsigned int mod_freesync_calc_v_total_from_refresh(
v_total = div64_u64(div64_u64(((unsigned long long)(
frame_duration_in_ns) * (stream->timing.pix_clk_100hz / 10)),
- stream->timing.h_total), 1000000);
+ stream->timing.h_total) + 500000, 1000000);
/* v_total cannot be less than nominal */
if (v_total < stream->timing.v_total) {
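Adding half the divisor before dividing turns truncation into round-to-nearest, so a v_total that works out to a fraction just under the next line no longer lands one line low. Worked example with the 1000000 divisor used above:

	/* exact value 1124.9996 lines, scaled by 1e6 */
	unsigned long long numer = 1124999600ULL;

	unsigned long long truncated = numer / 1000000;			/* 1124 */
	unsigned long long rounded = (numer + 500000) / 1000000;	/* 1125 */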
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 745fd052840d..3f91926a50e9 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -85,7 +85,7 @@ enum amd_apu_flags {
* @AMD_IP_BLOCK_TYPE_MES: Micro-Engine Scheduler
* @AMD_IP_BLOCK_TYPE_JPEG: JPEG Engine
* @AMD_IP_BLOCK_TYPE_VPE: Video Processing Engine
-* @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Schduler for Multimedia
+* @AMD_IP_BLOCK_TYPE_UMSCH_MM: User Mode Scheduler for Multimedia
* @AMD_IP_BLOCK_TYPE_ISP: Image Signal Processor
* @AMD_IP_BLOCK_TYPE_NUM: Total number of IP block types
*/
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 7744ca3ef4b1..e3e635a31b8a 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -71,6 +71,11 @@ enum kgd_memory_pool {
KGD_POOL_FRAMEBUFFER = 3,
};
+struct kfd_cu_occupancy {
+ u32 wave_cnt;
+ u32 doorbell_off;
+};
+
/**
* enum kfd_sched_policy
*
@@ -313,8 +318,9 @@ struct kfd2kgd_calls {
uint32_t grace_period,
uint32_t *reg_offset,
uint32_t *reg_data);
- void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
- int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
+ void (*get_cu_occupancy)(struct amdgpu_device *adev,
+ struct kfd_cu_occupancy *cu_occupancy,
+ int *max_waves_per_cu, uint32_t inst);
void (*program_trap_handler_settings)(struct amdgpu_device *adev,
uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
uint32_t inst);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
index 0b3c2f54a343..822c6425d90e 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_6_pmfw.h
@@ -123,7 +123,7 @@ typedef enum {
VOLTAGE_GUARDBAND_COUNT
} GFX_GUARDBAND_e;
-#define SMU_METRICS_TABLE_VERSION 0xC
+#define SMU_METRICS_TABLE_VERSION 0xD
typedef struct __attribute__((packed, aligned(4))) {
uint32_t AccumulationCounter;
@@ -227,6 +227,10 @@ typedef struct __attribute__((packed, aligned(4))) {
// PCIE LINK Speed and width
uint32_t PCIeLinkSpeed;
uint32_t PCIeLinkWidth;
+
+ // PER XCD ACTIVITY
+ uint32_t GfxBusy[8];
+ uint64_t GfxBusyAcc[8];
} MetricsTableX_t;
typedef struct __attribute__((packed, aligned(4))) {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index a887ab945dfa..1d024b122b0c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2569,10 +2569,14 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu,
}
}
- return smu_cmn_send_smc_msg_with_param(smu,
+ ret = smu_cmn_send_smc_msg_with_param(smu,
SMU_MSG_SetWorkloadMask,
workload_mask,
NULL);
+ if (!ret)
+ smu->workload_mask = workload_mask;
+
+ return ret;
}
static bool smu_v13_0_0_is_mode1_reset_supported(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 9974c9f8135e..55ed6247eb61 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2107,8 +2107,12 @@ static int smu_v13_0_6_i2c_xfer(struct i2c_adapter *i2c_adap,
}
mutex_lock(&adev->pm.mutex);
r = smu_v13_0_6_request_i2c_xfer(smu, req);
- if (r)
- goto fail;
+ if (r) {
+ /* Retry once, in case of an i2c collision */
+ r = smu_v13_0_6_request_i2c_xfer(smu, req);
+ if (r)
+ goto fail;
+ }
for (c = i = 0; i < num_msgs; i++) {
if (!(msg[i].flags & I2C_M_RD)) {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
index 7bc95c404377..b891a5e0a396 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c
@@ -2501,8 +2501,11 @@ static int smu_v13_0_7_set_power_profile_mode(struct smu_context *smu, long *inp
return -EINVAL;
ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask,
1 << workload_type, NULL);
+
if (ret)
dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__);
+ else
+ smu->workload_mask = (1 << workload_type);
return ret;
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index 43820d7d2c54..5899d01fa73d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -1861,10 +1861,14 @@ static int smu_v14_0_2_set_power_profile_mode(struct smu_context *smu,
if (workload_type < 0)
return -EINVAL;
- return smu_cmn_send_smc_msg_with_param(smu,
+ ret = smu_cmn_send_smc_msg_with_param(smu,
SMU_MSG_SetWorkloadMask,
1 << workload_type,
NULL);
+ if (!ret)
+ smu->workload_mask = 1 << workload_type;
+
+ return ret;
}
static int smu_v14_0_2_baco_enter(struct smu_context *smu)
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index 00fbe9f8c03a..b1c294236cc8 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -916,7 +916,7 @@ intel_ddi_main_link_aux_domain(struct intel_digital_port *dig_port,
* instead of a specific AUX_IO_<port> reference without powering up any
* extra wells.
*/
- if (intel_encoder_can_psr(&dig_port->base))
+ if (intel_psr_needs_aux_io_power(&dig_port->base, crtc_state))
return intel_display_power_aux_io_domain(i915, dig_port->aux_ch);
else if (DISPLAY_VER(i915) < 14 &&
(intel_crtc_has_dp_encoder(crtc_state) ||
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index a1fcedfd404b..90fa73575feb 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -531,6 +531,10 @@ static void
intel_dp_set_source_rates(struct intel_dp *intel_dp)
{
/* The values must be in increasing order */
+ static const int bmg_rates[] = {
+ 162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000,
+ 810000, 1000000, 1350000,
+ };
static const int mtl_rates[] = {
162000, 216000, 243000, 270000, 324000, 432000, 540000, 675000,
810000, 1000000, 2000000,
@@ -561,8 +565,13 @@ intel_dp_set_source_rates(struct intel_dp *intel_dp)
intel_dp->source_rates || intel_dp->num_source_rates);
if (DISPLAY_VER(dev_priv) >= 14) {
- source_rates = mtl_rates;
- size = ARRAY_SIZE(mtl_rates);
+ if (IS_BATTLEMAGE(dev_priv)) {
+ source_rates = bmg_rates;
+ size = ARRAY_SIZE(bmg_rates);
+ } else {
+ source_rates = mtl_rates;
+ size = ARRAY_SIZE(mtl_rates);
+ }
max_rate = mtl_max_source_rate(intel_dp);
} else if (DISPLAY_VER(dev_priv) >= 11) {
source_rates = icl_rates;
@@ -4058,6 +4067,9 @@ intel_edp_init_dpcd(struct intel_dp *intel_dp, struct intel_connector *connector
drm_dp_is_branch(intel_dp->dpcd));
intel_init_dpcd_quirks(intel_dp, &intel_dp->desc.ident);
+ intel_dp->colorimetry_support =
+ intel_dp_get_colorimetry_status(intel_dp);
+
/*
* Read the eDP display control registers.
*
@@ -4171,6 +4183,9 @@ intel_dp_get_dpcd(struct intel_dp *intel_dp)
intel_init_dpcd_quirks(intel_dp, &intel_dp->desc.ident);
+ intel_dp->colorimetry_support =
+ intel_dp_get_colorimetry_status(intel_dp);
+
intel_dp_update_sink_caps(intel_dp);
}
@@ -6922,9 +6937,6 @@ intel_dp_init_connector(struct intel_digital_port *dig_port,
"HDCP init failed, skipping.\n");
}
- intel_dp->colorimetry_support =
- intel_dp_get_colorimetry_status(intel_dp);
-
intel_dp->frl.is_trained = false;
intel_dp->frl.trained_rate_gbps = 0;
diff --git a/drivers/gpu/drm/i915/display/intel_psr.c b/drivers/gpu/drm/i915/display/intel_psr.c
index 1f83b3b67ea6..136a0d6ca970 100644
--- a/drivers/gpu/drm/i915/display/intel_psr.c
+++ b/drivers/gpu/drm/i915/display/intel_psr.c
@@ -203,6 +203,25 @@ bool intel_encoder_can_psr(struct intel_encoder *encoder)
return false;
}
+bool intel_psr_needs_aux_io_power(struct intel_encoder *encoder,
+ const struct intel_crtc_state *crtc_state)
+{
+ /*
+ * For PSR/PR modes only eDP requires the AUX IO power to be enabled whenever
+ * the output is enabled. For non-eDP outputs the main link is always
+ * on, hence it doesn't require the HW initiated AUX wake-up signaling used
+ * for eDP.
+ *
+ * TODO:
+ * - Consider leaving AUX IO disabled for eDP / PR as well, in case
+ * the ALPM with main-link off mode is not enabled.
+ * - Leave AUX IO enabled for DP / PR, once support for ALPM with
+ * main-link off mode is added for it and this mode gets enabled.
+ */
+ return intel_crtc_has_type(crtc_state, INTEL_OUTPUT_EDP) &&
+ intel_encoder_can_psr(encoder);
+}
+
static bool psr_global_enabled(struct intel_dp *intel_dp)
{
struct intel_display *display = to_intel_display(intel_dp);
@@ -2784,13 +2803,6 @@ static int _psr1_ready_for_pipe_update_locked(struct intel_dp *intel_dp)
EDP_PSR_STATUS_STATE_MASK, 50);
}
-static int _panel_replay_ready_for_pipe_update_locked(struct intel_dp *intel_dp)
-{
- return intel_dp_is_edp(intel_dp) ?
- _psr2_ready_for_pipe_update_locked(intel_dp) :
- _psr1_ready_for_pipe_update_locked(intel_dp);
-}
-
/**
* intel_psr_wait_for_idle_locked - wait for PSR be ready for a pipe update
* @new_crtc_state: new CRTC state
@@ -2813,12 +2825,10 @@ void intel_psr_wait_for_idle_locked(const struct intel_crtc_state *new_crtc_stat
lockdep_assert_held(&intel_dp->psr.lock);
- if (!intel_dp->psr.enabled)
+ if (!intel_dp->psr.enabled || intel_dp->psr.panel_replay_enabled)
continue;
- if (intel_dp->psr.panel_replay_enabled)
- ret = _panel_replay_ready_for_pipe_update_locked(intel_dp);
- else if (intel_dp->psr.sel_update_enabled)
+ if (intel_dp->psr.sel_update_enabled)
ret = _psr2_ready_for_pipe_update_locked(intel_dp);
else
ret = _psr1_ready_for_pipe_update_locked(intel_dp);
diff --git a/drivers/gpu/drm/i915/display/intel_psr.h b/drivers/gpu/drm/i915/display/intel_psr.h
index 4e09c10908e4..6eb5f15f674f 100644
--- a/drivers/gpu/drm/i915/display/intel_psr.h
+++ b/drivers/gpu/drm/i915/display/intel_psr.h
@@ -25,6 +25,8 @@ struct intel_plane_state;
(intel_dp)->psr.source_panel_replay_support)
bool intel_encoder_can_psr(struct intel_encoder *encoder);
+bool intel_psr_needs_aux_io_power(struct intel_encoder *encoder,
+ const struct intel_crtc_state *crtc_state);
void intel_psr_init_dpcd(struct intel_dp *intel_dp);
void intel_psr_enable_sink(struct intel_dp *intel_dp,
const struct intel_crtc_state *crtc_state);
diff --git a/drivers/gpu/drm/xe/xe_bb.c b/drivers/gpu/drm/xe/xe_bb.c
index a13e0b3a169e..ef777dbdf4ec 100644
--- a/drivers/gpu/drm/xe/xe_bb.c
+++ b/drivers/gpu/drm/xe/xe_bb.c
@@ -65,7 +65,8 @@ __xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr)
{
u32 size = drm_suballoc_size(bb->bo);
- bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
+ if (bb->len == 0 || bb->cs[bb->len - 1] != MI_BATCH_BUFFER_END)
+ bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
xe_gt_assert(q->gt, bb->len * 4 + bb_prefetch(q->gt) <= size);
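The guard makes batch-buffer termination idempotent: a second finalization pass can no longer append a duplicate MI_BATCH_BUFFER_END and trip the size assert. A hedged standalone model of the check (the opcode value below is illustrative, not taken from the patch):

#include <assert.h>
#include <stdint.h>

#define MI_BATCH_BUFFER_END 0x05000000u	/* illustrative value */

/* Append the end-of-batch marker only if it is not already the last dword. */
static void bb_terminate(uint32_t *cs, uint32_t *len)
{
	if (*len == 0 || cs[*len - 1] != MI_BATCH_BUFFER_END)
		cs[(*len)++] = MI_BATCH_BUFFER_END;
}

int main(void)
{
	uint32_t cs[4] = { 0 };
	uint32_t len = 0;

	bb_terminate(cs, &len);
	bb_terminate(cs, &len);	/* second call is a no-op */
	assert(len == 1);
	return 0;
}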
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c
index 06911e9a3bf5..f379df3a12bf 100644
--- a/drivers/gpu/drm/xe/xe_bo.c
+++ b/drivers/gpu/drm/xe/xe_bo.c
@@ -2320,6 +2320,20 @@ void xe_bo_put_commit(struct llist_head *deferred)
drm_gem_object_free(&bo->ttm.base.refcount);
}
+void xe_bo_put(struct xe_bo *bo)
+{
+ might_sleep();
+ if (bo) {
+#ifdef CONFIG_PROC_FS
+ if (bo->client)
+ might_lock(&bo->client->bos_lock);
+#endif
+ if (bo->ggtt_node && bo->ggtt_node->ggtt)
+ might_lock(&bo->ggtt_node->ggtt->lock);
+ drm_gem_object_put(&bo->ttm.base);
+ }
+}
+
/**
* xe_bo_dumb_create - Create a dumb bo as backing for a fb
* @file_priv: ...
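Moving xe_bo_put() out of line lets it carry might_sleep()/might_lock() annotations, so lockdep can flag an unsafe calling context on every put rather than only on the rare final-reference path that actually takes bos_lock or the GGTT lock. A userspace analogue of the idea, using an eager lock/unlock pair as a stand-in for might_lock():

#include <pthread.h>
#include <assert.h>

/* Stand-in for the kernel's might_lock(): take and drop the lock eagerly so
 * an illegal context (lock already held, wrong ordering) fails on every call,
 * not only when the rare path really needs the lock. */
static void might_lock_stub(pthread_mutex_t *m)
{
	assert(pthread_mutex_lock(m) == 0);
	assert(pthread_mutex_unlock(m) == 0);
}

static pthread_mutex_t bos_lock = PTHREAD_MUTEX_INITIALIZER;

static void bo_put(int has_client)
{
	if (has_client)
		might_lock_stub(&bos_lock); /* deadlocks here if already held */
	/* ... drop the reference ... */
}

int main(void)
{
	bo_put(1);
	return 0;
}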
diff --git a/drivers/gpu/drm/xe/xe_bo.h b/drivers/gpu/drm/xe/xe_bo.h
index dbfb3209615d..6e4be52306df 100644
--- a/drivers/gpu/drm/xe/xe_bo.h
+++ b/drivers/gpu/drm/xe/xe_bo.h
@@ -126,11 +126,7 @@ static inline struct xe_bo *xe_bo_get(struct xe_bo *bo)
return bo;
}
-static inline void xe_bo_put(struct xe_bo *bo)
-{
- if (bo)
- drm_gem_object_put(&bo->ttm.base);
-}
+void xe_bo_put(struct xe_bo *bo);
static inline void __xe_bo_unset_bulk_move(struct xe_bo *bo)
{
diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
index 95a05c5bc897..c4add8b38bbd 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.c
+++ b/drivers/gpu/drm/xe/xe_drm_client.c
@@ -168,15 +168,10 @@ static void bo_meminfo(struct xe_bo *bo,
struct drm_memory_stats stats[TTM_NUM_MEM_TYPES])
{
u64 sz = bo->size;
- u32 mem_type;
+ u32 mem_type = bo->ttm.resource->mem_type;
xe_bo_assert_held(bo);
- if (bo->placement.placement)
- mem_type = bo->placement.placement->mem_type;
- else
- mem_type = XE_PL_TT;
-
if (drm_gem_object_is_shared_for_memory_stats(&bo->ttm.base))
stats[mem_type].shared += sz;
else
diff --git a/drivers/gpu/drm/xe/xe_gt_pagefault.c b/drivers/gpu/drm/xe/xe_gt_pagefault.c
index 730eec07795e..00af059a8971 100644
--- a/drivers/gpu/drm/xe/xe_gt_pagefault.c
+++ b/drivers/gpu/drm/xe/xe_gt_pagefault.c
@@ -212,6 +212,12 @@ static int handle_pagefault(struct xe_gt *gt, struct pagefault *pf)
* TODO: Change to read lock? Using write lock for simplicity.
*/
down_write(&vm->lock);
+
+ if (xe_vm_is_closed(vm)) {
+ err = -ENOENT;
+ goto unlock_vm;
+ }
+
vma = lookup_vma(vm, pf->page_addr);
if (!vma) {
err = -EINVAL;
diff --git a/drivers/gpu/drm/xe/xe_guc.h b/drivers/gpu/drm/xe/xe_guc.h
index c3e6b51f7a09..42116b167c98 100644
--- a/drivers/gpu/drm/xe/xe_guc.h
+++ b/drivers/gpu/drm/xe/xe_guc.h
@@ -18,8 +18,10 @@
*/
#define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat))
#define MAKE_GUC_VER_STRUCT(ver) MAKE_GUC_VER((ver).major, (ver).minor, (ver).patch)
-#define GUC_SUBMIT_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY])
-#define GUC_FIRMWARE_VER(guc) MAKE_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE])
+#define GUC_SUBMIT_VER(guc) \
+ MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY])
+#define GUC_FIRMWARE_VER(guc) \
+ MAKE_GUC_VER_STRUCT((guc)->fw.versions.found[XE_UC_FW_VER_RELEASE])
struct drm_printer;
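The fix here is that both macros expanded to a non-existent MAKE_VER_STRUCT; they now use the MAKE_GUC_VER_STRUCT helper defined just above. A quick standalone check of the bit packing that helper relies on, with the macro copied from the header:

#include <assert.h>
#include <stdint.h>

/* Major in bits 23:16, minor in 15:8, patch in 7:0. */
#define MAKE_GUC_VER(maj, min, pat) (((maj) << 16) | ((min) << 8) | (pat))

int main(void)
{
	uint32_t v = MAKE_GUC_VER(1, 2, 3);

	assert(v == 0x010203);
	assert(((v >> 16) & 0xff) == 1);	/* major */
	assert(((v >> 8) & 0xff) == 2);		/* minor */
	assert((v & 0xff) == 3);		/* patch */
	return 0;
}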
diff --git a/drivers/gpu/drm/xe/xe_vram.c b/drivers/gpu/drm/xe/xe_vram.c
index 5bcd59190353..80ba2fc78837 100644
--- a/drivers/gpu/drm/xe/xe_vram.c
+++ b/drivers/gpu/drm/xe/xe_vram.c
@@ -182,6 +182,7 @@ static inline u64 get_flat_ccs_offset(struct xe_gt *gt, u64 tile_size)
offset = offset_hi << 32; /* HW view bits 39:32 */
offset |= offset_lo << 6; /* HW view bits 31:6 */
offset *= num_enabled; /* convert to SW view */
+ offset = round_up(offset, SZ_128K); /* SW must round up to nearest 128K */
/* We don't expect any holes */
xe_assert_msg(xe, offset == (xe_mmio_read64_2x32(gt, GSMBASE) - ccs_size),
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 53f18b351f53..6b3ba7e5723a 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -782,6 +782,7 @@ config I2C_JZ4780
config I2C_KEBA
tristate "KEBA I2C controller support"
depends on HAS_IOMEM
+ depends on KEBA_CP500 || COMPILE_TEST
select AUXILIARY_BUS
help
This driver supports the I2C controller found in KEBA system FPGA
diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c
index 080204182bb5..f31d352d98b5 100644
--- a/drivers/i2c/busses/i2c-designware-common.c
+++ b/drivers/i2c/busses/i2c-designware-common.c
@@ -523,6 +523,7 @@ err_release_lock:
void __i2c_dw_disable(struct dw_i2c_dev *dev)
{
+ struct i2c_timings *t = &dev->timings;
unsigned int raw_intr_stats;
unsigned int enable;
int timeout = 100;
@@ -535,6 +536,19 @@ void __i2c_dw_disable(struct dw_i2c_dev *dev)
abort_needed = raw_intr_stats & DW_IC_INTR_MST_ON_HOLD;
if (abort_needed) {
+ if (!(enable & DW_IC_ENABLE_ENABLE)) {
+ regmap_write(dev->map, DW_IC_ENABLE, DW_IC_ENABLE_ENABLE);
+ /*
+ * Wait 10 times the signaling period of the highest I2C
+ * transfer supported by the driver (for 400KHz this is
+ * 25us) to ensure the I2C ENABLE bit is already set
+ * as described in the DesignWare I2C databook.
+ */
+ fsleep(DIV_ROUND_CLOSEST_ULL(10 * MICRO, t->bus_freq_hz));
+ /* Set ENABLE bit before setting ABORT */
+ enable |= DW_IC_ENABLE_ENABLE;
+ }
+
regmap_write(dev->map, DW_IC_ENABLE, enable | DW_IC_ENABLE_ABORT);
ret = regmap_read_poll_timeout(dev->map, DW_IC_ENABLE, enable,
!(enable & DW_IC_ENABLE_ABORT), 10,
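The fsleep() duration is DIV_ROUND_CLOSEST_ULL(10 * MICRO, bus_freq_hz): ten bit periods at the configured bus rate, expressed in whole microseconds. A small sketch verifying the arithmetic the comment quotes, assuming MICRO is the kernel's 1000000 constant from units.h:

#include <assert.h>

#define MICRO 1000000UL	/* microseconds per second */

/* Ten bit periods at the configured bus frequency, rounded to nearest us. */
static unsigned long ten_periods_us(unsigned long bus_freq_hz)
{
	return (10UL * MICRO + bus_freq_hz / 2) / bus_freq_hz;
}

int main(void)
{
	assert(ten_periods_us(400000) == 25);	/* 400 kHz -> 25 us, per the comment */
	assert(ten_periods_us(100000) == 100);	/* 100 kHz -> 100 us */
	return 0;
}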
diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h
index 1ac2afd03a0a..8e8854ec9882 100644
--- a/drivers/i2c/busses/i2c-designware-core.h
+++ b/drivers/i2c/busses/i2c-designware-core.h
@@ -108,6 +108,7 @@
DW_IC_INTR_RX_UNDER | \
DW_IC_INTR_RD_REQ)
+#define DW_IC_ENABLE_ENABLE BIT(0)
#define DW_IC_ENABLE_ABORT BIT(1)
#define DW_IC_STATUS_ACTIVITY BIT(0)
diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c
index e46f1b22c360..e8ac9a7bf0b3 100644
--- a/drivers/i2c/busses/i2c-designware-master.c
+++ b/drivers/i2c/busses/i2c-designware-master.c
@@ -271,6 +271,34 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev)
__i2c_dw_write_intr_mask(dev, DW_IC_INTR_MASTER_MASK);
}
+/*
+ * This function waits for the controller to be idle before disabling I2C.
+ * When the controller is not in the IDLE state, the MST_ACTIVITY bit
+ * (IC_STATUS[5]) is set.
+ *
+ * Values:
+ * 0x1 (ACTIVE): Controller not idle
+ * 0x0 (IDLE): Controller is idle
+ *
+ * The function is called after completing the current transfer.
+ *
+ * Returns:
+ * False when the controller is in the IDLE state.
+ * True when the controller is in the ACTIVE state.
+ */
+static bool i2c_dw_is_controller_active(struct dw_i2c_dev *dev)
+{
+ u32 status;
+
+ regmap_read(dev->map, DW_IC_STATUS, &status);
+ if (!(status & DW_IC_STATUS_MASTER_ACTIVITY))
+ return false;
+
+ return regmap_read_poll_timeout(dev->map, DW_IC_STATUS, status,
+ !(status & DW_IC_STATUS_MASTER_ACTIVITY),
+ 1100, 20000) != 0;
+}
+
static int i2c_dw_check_stopbit(struct dw_i2c_dev *dev)
{
u32 val;
@@ -807,6 +835,16 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
}
/*
+ * This happens rarely (~1:500) and is hard to reproduce. A debug trace
+ * showed that IC_STATUS had a value of 0x23 when STOP_DET occurred;
+ * disabling IC_ENABLE.ENABLE immediately at that point can result in
+ * IC_RAW_INTR_STAT.MASTER_ON_HOLD holding SCL low. Check whether the
+ * controller is still ACTIVE before disabling I2C.
+ */
+ if (i2c_dw_is_controller_active(dev))
+ dev_err(dev->dev, "controller active\n");
+
+ /*
* We must disable the adapter before returning and signaling the end
* of the current transfer. Otherwise the hardware might continue
* generating interrupts which in turn causes a race condition with
diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c
index 4eccbcd0fbfc..bbb9062669e4 100644
--- a/drivers/i2c/busses/i2c-synquacer.c
+++ b/drivers/i2c/busses/i2c-synquacer.c
@@ -550,12 +550,13 @@ static int synquacer_i2c_probe(struct platform_device *pdev)
device_property_read_u32(&pdev->dev, "socionext,pclk-rate",
&i2c->pclkrate);
- pclk = devm_clk_get_enabled(&pdev->dev, "pclk");
+ pclk = devm_clk_get_optional_enabled(&pdev->dev, "pclk");
if (IS_ERR(pclk))
return dev_err_probe(&pdev->dev, PTR_ERR(pclk),
"failed to get and enable clock\n");
- i2c->pclkrate = clk_get_rate(pclk);
+ if (pclk)
+ i2c->pclkrate = clk_get_rate(pclk);
if (i2c->pclkrate < SYNQUACER_I2C_MIN_CLK_RATE ||
i2c->pclkrate > SYNQUACER_I2C_MAX_CLK_RATE)
diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c
index 4c89aad02451..1d68177241a6 100644
--- a/drivers/i2c/busses/i2c-xiic.c
+++ b/drivers/i2c/busses/i2c-xiic.c
@@ -1337,8 +1337,8 @@ static int xiic_i2c_probe(struct platform_device *pdev)
return 0;
err_pm_disable:
- pm_runtime_set_suspended(&pdev->dev);
pm_runtime_disable(&pdev->dev);
+ pm_runtime_set_suspended(&pdev->dev);
return ret;
}
diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index 4eed97295927..6fb995778636 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -25,6 +25,7 @@ config ARM_MHU_V2
config ARM_MHU_V3
tristate "ARM MHUv3 Mailbox"
+ depends on ARM64 || COMPILE_TEST
depends on HAS_IOMEM || COMPILE_TEST
depends on OF
help
@@ -73,7 +74,7 @@ config ARMADA_37XX_RWTM_MBOX
config OMAP2PLUS_MBOX
tristate "OMAP2+ Mailbox framework support"
- depends on ARCH_OMAP2PLUS || ARCH_K3
+ depends on ARCH_OMAP2PLUS || ARCH_K3 || COMPILE_TEST
help
Mailbox implementation for OMAP family chips with hardware for
interprocessor communication involving DSP, IVA1.0 and IVA2 in
diff --git a/drivers/mailbox/bcm2835-mailbox.c b/drivers/mailbox/bcm2835-mailbox.c
index fbfd0202047c..ea12fb8d2401 100644
--- a/drivers/mailbox/bcm2835-mailbox.c
+++ b/drivers/mailbox/bcm2835-mailbox.c
@@ -145,7 +145,8 @@ static int bcm2835_mbox_probe(struct platform_device *pdev)
spin_lock_init(&mbox->lock);
ret = devm_request_irq(dev, irq_of_parse_and_map(dev->of_node, 0),
- bcm2835_mbox_irq, 0, dev_name(dev), mbox);
+ bcm2835_mbox_irq, IRQF_NO_SUSPEND, dev_name(dev),
+ mbox);
if (ret) {
dev_err(dev, "Failed to register a mailbox IRQ handler: %d\n",
ret);
diff --git a/drivers/mailbox/imx-mailbox.c b/drivers/mailbox/imx-mailbox.c
index d17efb1dd0cb..f815dab3be50 100644
--- a/drivers/mailbox/imx-mailbox.c
+++ b/drivers/mailbox/imx-mailbox.c
@@ -30,7 +30,7 @@
#define IMX_MU_SCU_CHANS 6
/* TX0/RX0 */
#define IMX_MU_S4_CHANS 2
-#define IMX_MU_CHAN_NAME_SIZE 20
+#define IMX_MU_CHAN_NAME_SIZE 32
#define IMX_MU_V2_PAR_OFF 0x4
#define IMX_MU_V2_TR_MASK GENMASK(7, 0)
@@ -782,7 +782,7 @@ static int imx_mu_init_generic(struct imx_mu_priv *priv)
cp->chan = &priv->mbox_chans[i];
priv->mbox_chans[i].con_priv = cp;
snprintf(cp->irq_desc, sizeof(cp->irq_desc),
- "imx_mu_chan[%i-%i]", cp->type, cp->idx);
+ "%s[%i-%i]", dev_name(priv->dev), cp->type, cp->idx);
}
priv->mbox.num_chans = IMX_MU_CHANS;
@@ -819,7 +819,7 @@ static int imx_mu_init_specific(struct imx_mu_priv *priv)
cp->chan = &priv->mbox_chans[i];
priv->mbox_chans[i].con_priv = cp;
snprintf(cp->irq_desc, sizeof(cp->irq_desc),
- "imx_mu_chan[%i-%i]", cp->type, cp->idx);
+ "%s[%i-%i]", dev_name(priv->dev), cp->type, cp->idx);
}
priv->mbox.num_chans = num_chans;
diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c
index ebff3baf3045..d3d26a2c9895 100644
--- a/drivers/mailbox/mailbox.c
+++ b/drivers/mailbox/mailbox.c
@@ -450,30 +450,20 @@ struct mbox_chan *mbox_request_channel_byname(struct mbox_client *cl,
const char *name)
{
struct device_node *np = cl->dev->of_node;
- struct property *prop;
- const char *mbox_name;
- int index = 0;
+ int index;
if (!np) {
dev_err(cl->dev, "%s() currently only supports DT\n", __func__);
return ERR_PTR(-EINVAL);
}
- if (!of_get_property(np, "mbox-names", NULL)) {
- dev_err(cl->dev,
- "%s() requires an \"mbox-names\" property\n", __func__);
+ index = of_property_match_string(np, "mbox-names", name);
+ if (index < 0) {
+ dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n",
+ __func__, name);
return ERR_PTR(-EINVAL);
}
-
- of_property_for_each_string(np, "mbox-names", prop, mbox_name) {
- if (!strncmp(name, mbox_name, strlen(name)))
- return mbox_request_channel(cl, index);
- index++;
- }
-
- dev_err(cl->dev, "%s() could not locate channel named \"%s\"\n",
- __func__, name);
- return ERR_PTR(-EINVAL);
+ return mbox_request_channel(cl, index);
}
EXPORT_SYMBOL_GPL(mbox_request_channel_byname);
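With the open-coded loop gone, the lookup is a single of_property_match_string() call. This also tightens matching: the old strncmp(name, mbox_name, strlen(name)) accepted any entry that merely started with the requested name. A userspace model of the difference:

#include <stdio.h>
#include <string.h>

/* Model of of_property_match_string(): index of the first exact match in a
 * NULL-terminated string list, or -1. The old loop's strncmp with
 * strlen(name) also matched prefixes, so "tx" would have hit "tx-high". */
static int match_string_index(const char **list, const char *name)
{
	int i;

	for (i = 0; list[i]; i++)
		if (!strcmp(list[i], name))
			return i;
	return -1;
}

int main(void)
{
	const char *mbox_names[] = { "tx-high", "tx", "rx", NULL };

	printf("%d\n", match_string_index(mbox_names, "tx")); /* 1, not 0 */
	return 0;
}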
diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 7a87424657a1..6797770474a5 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -603,7 +603,7 @@ static struct platform_driver omap_mbox_driver = {
.driver = {
.name = "omap-mailbox",
.pm = &omap_mbox_pm_ops,
- .of_match_table = of_match_ptr(omap_mailbox_of_match),
+ .of_match_table = omap_mailbox_of_match,
},
};
module_platform_driver(omap_mbox_driver);
diff --git a/drivers/mailbox/rockchip-mailbox.c b/drivers/mailbox/rockchip-mailbox.c
index 8ffad059e898..4d966cb2ed03 100644
--- a/drivers/mailbox/rockchip-mailbox.c
+++ b/drivers/mailbox/rockchip-mailbox.c
@@ -159,7 +159,7 @@ static const struct of_device_id rockchip_mbox_of_match[] = {
{ .compatible = "rockchip,rk3368-mailbox", .data = &rk3368_drv_data},
{ },
};
-MODULE_DEVICE_TABLE(of, rockchp_mbox_of_match);
+MODULE_DEVICE_TABLE(of, rockchip_mbox_of_match);
static int rockchip_mbox_probe(struct platform_device *pdev)
{
diff --git a/drivers/mailbox/sprd-mailbox.c b/drivers/mailbox/sprd-mailbox.c
index 9ae57de77d4d..ee8539dfcef5 100644
--- a/drivers/mailbox/sprd-mailbox.c
+++ b/drivers/mailbox/sprd-mailbox.c
@@ -62,7 +62,6 @@ struct sprd_mbox_priv {
void __iomem *outbox_base;
/* Base register address for supplementary outbox */
void __iomem *supp_base;
- struct clk *clk;
u32 outbox_fifo_depth;
struct mutex lock;
@@ -291,19 +290,13 @@ static const struct mbox_chan_ops sprd_mbox_ops = {
.shutdown = sprd_mbox_shutdown,
};
-static void sprd_mbox_disable(void *data)
-{
- struct sprd_mbox_priv *priv = data;
-
- clk_disable_unprepare(priv->clk);
-}
-
static int sprd_mbox_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct sprd_mbox_priv *priv;
int ret, inbox_irq, outbox_irq, supp_irq;
unsigned long id, supp;
+ struct clk *clk;
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
@@ -331,20 +324,10 @@ static int sprd_mbox_probe(struct platform_device *pdev)
if (IS_ERR(priv->outbox_base))
return PTR_ERR(priv->outbox_base);
- priv->clk = devm_clk_get(dev, "enable");
- if (IS_ERR(priv->clk)) {
+ clk = devm_clk_get_enabled(dev, "enable");
+ if (IS_ERR(clk)) {
dev_err(dev, "failed to get mailbox clock\n");
- return PTR_ERR(priv->clk);
- }
-
- ret = clk_prepare_enable(priv->clk);
- if (ret)
- return ret;
-
- ret = devm_add_action_or_reset(dev, sprd_mbox_disable, priv);
- if (ret) {
- dev_err(dev, "failed to add mailbox disable action\n");
- return ret;
+ return PTR_ERR(clk);
}
inbox_irq = platform_get_irq_byname(pdev, "inbox");
diff --git a/drivers/message/fusion/mptctl.c b/drivers/message/fusion/mptctl.c
index 4766d8518dc9..77fa55df70d0 100644
--- a/drivers/message/fusion/mptctl.c
+++ b/drivers/message/fusion/mptctl.c
@@ -1609,7 +1609,7 @@ mptctl_eventreport (MPT_ADAPTER *ioc, unsigned long arg)
maxEvents = numBytes/sizeof(MPT_IOCTL_EVENTS);
- max = MPTCTL_EVENT_LOG_SIZE < maxEvents ? MPTCTL_EVENT_LOG_SIZE : maxEvents;
+ max = min(maxEvents, MPTCTL_EVENT_LOG_SIZE);
/* If fewer than 1 event is requested, there must have
* been some type of error.
diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 7a80c92b785e..955e4e38477e 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -330,8 +330,7 @@ config STM32_RPROC
config TI_K3_DSP_REMOTEPROC
tristate "TI K3 DSP remoteproc support"
depends on ARCH_K3
- select MAILBOX
- select OMAP2PLUS_MBOX
+ depends on OMAP2PLUS_MBOX
help
Say m here to support TI's C66x and C71x DSP remote processor
subsystems on various TI K3 family of SoCs through the remote
@@ -356,8 +355,7 @@ config TI_K3_M4_REMOTEPROC
config TI_K3_R5_REMOTEPROC
tristate "TI K3 R5 remoteproc support"
depends on ARCH_K3
- select MAILBOX
- select OMAP2PLUS_MBOX
+ depends on OMAP2PLUS_MBOX
help
Say m here to support TI's R5F remote processor subsystems
on various TI K3 family of SoCs through the remote processor
diff --git a/drivers/s390/crypto/vfio_ap_drv.c b/drivers/s390/crypto/vfio_ap_drv.c
index 4aeb3e1213c7..67a807e2e75b 100644
--- a/drivers/s390/crypto/vfio_ap_drv.c
+++ b/drivers/s390/crypto/vfio_ap_drv.c
@@ -26,6 +26,18 @@ MODULE_LICENSE("GPL v2");
struct ap_matrix_dev *matrix_dev;
debug_info_t *vfio_ap_dbf_info;
+static ssize_t features_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "guest_matrix hotplug ap_config\n");
+}
+static DEVICE_ATTR_RO(features);
+
+static struct attribute *matrix_dev_attrs[] = {
+ &dev_attr_features.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(matrix_dev);
+
/* Only type 10 adapters (CEX4 and later) are supported
* by the AP matrix device driver
*/
@@ -68,6 +80,7 @@ static struct device_driver matrix_driver = {
.name = "vfio_ap",
.bus = &matrix_bus,
.suppress_bind_attrs = true,
+ .dev_groups = matrix_dev_groups,
};
static int vfio_ap_matrix_dev_create(void)
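The new read-only attribute lets userspace discover driver capabilities before configuring a mediated device. A hedged sketch of reading it; the sysfs path below is an assumption about where the matrix device lands, not something stated in the patch:

#include <stdio.h>

int main(void)
{
	char buf[64];
	/* Assumed location of the matrix device's new attribute. */
	FILE *f = fopen("/sys/devices/vfio_ap/matrix/features", "r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("vfio_ap features: %s", buf); /* "guest_matrix hotplug ap_config" */
	fclose(f);
	return 0;
}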
diff --git a/drivers/scsi/cxgbi/libcxgbi.h b/drivers/scsi/cxgbi/libcxgbi.h
index d92cf1dccc2f..0909b03e2497 100644
--- a/drivers/scsi/cxgbi/libcxgbi.h
+++ b/drivers/scsi/cxgbi/libcxgbi.h
@@ -485,7 +485,6 @@ struct cxgbi_device {
unsigned char nmtus;
unsigned char nports;
struct pci_dev *pdev;
- struct dentry *debugfs_root;
struct iscsi_transport *itp;
struct module *owner;
@@ -499,7 +498,6 @@ struct cxgbi_device {
unsigned int rxq_idx_cntr;
struct cxgbi_ports_map pmap;
- void (*dev_ddp_cleanup)(struct cxgbi_device *);
struct cxgbi_ppm* (*cdev2ppm)(struct cxgbi_device *);
int (*csk_ddp_set_map)(struct cxgbi_ppm *, struct cxgbi_sock *,
struct cxgbi_task_tag_info *);
@@ -512,7 +510,6 @@ struct cxgbi_device {
unsigned int, int);
void (*csk_release_offload_resources)(struct cxgbi_sock *);
- int (*csk_rx_pdu_ready)(struct cxgbi_sock *, struct sk_buff *);
u32 (*csk_send_rx_credits)(struct cxgbi_sock *, u32);
int (*csk_push_tx_frames)(struct cxgbi_sock *, int);
void (*csk_send_abort_req)(struct cxgbi_sock *);
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index feda9b54b443..4cd3a3eab6f1 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -2421,7 +2421,7 @@ out:
spin_lock_irqsave(&device->done_lock, flags);
if (test_bit(SAS_HA_FROZEN, &ha->state)) {
spin_unlock_irqrestore(&device->done_lock, flags);
- dev_info(dev, "slot complete: task(%pK) ignored\n ",
+ dev_info(dev, "slot complete: task(%pK) ignored\n",
task);
return;
}
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index a3d1013c8307..e66c3ef74267 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -37,6 +37,7 @@ static unsigned int default_timeout = IBMVFC_DEFAULT_TIMEOUT;
static u64 max_lun = IBMVFC_MAX_LUN;
static unsigned int max_targets = IBMVFC_MAX_TARGETS;
static unsigned int max_requests = IBMVFC_MAX_REQUESTS_DEFAULT;
+static u16 max_sectors = IBMVFC_MAX_SECTORS;
static u16 scsi_qdepth = IBMVFC_SCSI_QDEPTH;
static unsigned int disc_threads = IBMVFC_MAX_DISC_THREADS;
static unsigned int ibmvfc_debug = IBMVFC_DEBUG;
@@ -83,6 +84,9 @@ MODULE_PARM_DESC(default_timeout,
module_param_named(max_requests, max_requests, uint, S_IRUGO);
MODULE_PARM_DESC(max_requests, "Maximum requests for this adapter. "
"[Default=" __stringify(IBMVFC_MAX_REQUESTS_DEFAULT) "]");
+module_param_named(max_sectors, max_sectors, ushort, S_IRUGO);
+MODULE_PARM_DESC(max_sectors, "Maximum sectors for this adapter. "
+ "[Default=" __stringify(IBMVFC_MAX_SECTORS) "]");
module_param_named(scsi_qdepth, scsi_qdepth, ushort, S_IRUGO);
MODULE_PARM_DESC(scsi_qdepth, "Maximum scsi command depth per adapter queue. "
"[Default=" __stringify(IBMVFC_SCSI_QDEPTH) "]");
@@ -1494,7 +1498,7 @@ static void ibmvfc_set_login_info(struct ibmvfc_host *vhost)
memset(login_info, 0, sizeof(*login_info));
login_info->ostype = cpu_to_be32(IBMVFC_OS_LINUX);
- login_info->max_dma_len = cpu_to_be64(IBMVFC_MAX_SECTORS << 9);
+ login_info->max_dma_len = cpu_to_be64(max_sectors << 9);
login_info->max_payload = cpu_to_be32(sizeof(struct ibmvfc_fcp_cmd_iu));
login_info->max_response = cpu_to_be32(sizeof(struct ibmvfc_fcp_rsp));
login_info->partition_num = cpu_to_be32(vhost->partition_number);
@@ -5230,7 +5234,7 @@ static void ibmvfc_npiv_login_done(struct ibmvfc_event *evt)
}
vhost->logged_in = 1;
- npiv_max_sectors = min((uint)(be64_to_cpu(rsp->max_dma_len) >> 9), IBMVFC_MAX_SECTORS);
+ npiv_max_sectors = min((uint)(be64_to_cpu(rsp->max_dma_len) >> 9), max_sectors);
dev_info(vhost->dev, "Host partition: %s, device: %s %s %s max sectors %u\n",
rsp->partition_name, rsp->device_name, rsp->port_loc_code,
rsp->drc_name, npiv_max_sectors);
@@ -6329,7 +6333,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
shost->can_queue = scsi_qdepth;
shost->max_lun = max_lun;
shost->max_id = max_targets;
- shost->max_sectors = IBMVFC_MAX_SECTORS;
+ shost->max_sectors = max_sectors;
shost->max_cmd_len = IBMVFC_MAX_CDB_LEN;
shost->unique_id = shost->host_no;
shost->nr_hw_queues = mq_enabled ? min(max_scsi_queues, nr_scsi_hw_queues) : 1;
@@ -6556,6 +6560,7 @@ static struct fc_function_template ibmvfc_transport_functions = {
**/
static int __init ibmvfc_module_init(void)
{
+ int min_max_sectors = PAGE_SIZE >> 9;
int rc;
if (!firmware_has_feature(FW_FEATURE_VIO))
@@ -6564,6 +6569,16 @@ static int __init ibmvfc_module_init(void)
printk(KERN_INFO IBMVFC_NAME": IBM Virtual Fibre Channel Driver version: %s %s\n",
IBMVFC_DRIVER_VERSION, IBMVFC_DRIVER_DATE);
+ /*
+ * Range check the max_sectors module parameter. The upper bound is
+ * implicitly checked since the parameter is a ushort.
+ */
+ if (max_sectors < min_max_sectors) {
+ printk(KERN_ERR IBMVFC_NAME ": max_sectors must be at least %d.\n",
+ min_max_sectors);
+ max_sectors = min_max_sectors;
+ }
+
ibmvfc_transport_template = fc_attach_transport(&ibmvfc_transport_functions);
if (!ibmvfc_transport_template)
return -ENOMEM;
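The lower bound is one page expressed in 512-byte sectors, and the new IBMVFC_MAX_SECTORS default of 2048 caps transfers at 1 MiB per command. A small check of both numbers, assuming a 4 KiB PAGE_SIZE (it is architecture-dependent):

#include <assert.h>

#define SECTOR_SHIFT 9	/* 512-byte SCSI sectors */

int main(void)
{
	unsigned long page_size = 4096;	/* assumed; PAGE_SIZE varies by arch */

	/* Lower bound enforced by the new check: one page worth of sectors. */
	assert((page_size >> SECTOR_SHIFT) == 8);

	/* New default: 2048 sectors == 1 MiB maximum transfer. */
	assert((2048UL << SECTOR_SHIFT) == 1024 * 1024);
	return 0;
}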
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.h b/drivers/scsi/ibmvscsi/ibmvfc.h
index 745ad5ac7251..c73ed2314ad0 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.h
+++ b/drivers/scsi/ibmvscsi/ibmvfc.h
@@ -32,7 +32,7 @@
#define IBMVFC_DEBUG 0
#define IBMVFC_MAX_TARGETS 1024
#define IBMVFC_MAX_LUN 0xffffffff
-#define IBMVFC_MAX_SECTORS 0xffffu
+#define IBMVFC_MAX_SECTORS 2048
#define IBMVFC_MAX_DISC_THREADS 4
#define IBMVFC_TGT_MEMPOOL_SZ 64
#define IBMVFC_MAX_CMDS_PER_LUN 64
diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c
index 4756a3f82531..85059b83ea6b 100644
--- a/drivers/scsi/lpfc/lpfc_bsg.c
+++ b/drivers/scsi/lpfc/lpfc_bsg.c
@@ -3208,6 +3208,9 @@ lpfc_bsg_diag_loopback_run(struct bsg_job *job)
cmdiocbq->num_bdes = num_bde;
cmdiocbq->cmd_flag |= LPFC_IO_LIBDFC;
cmdiocbq->cmd_flag |= LPFC_IO_LOOPBACK;
+ if (phba->cfg_vmid_app_header)
+ cmdiocbq->cmd_flag |= LPFC_IO_VMID;
+
cmdiocbq->vport = phba->pport;
cmdiocbq->cmd_cmpl = NULL;
cmdiocbq->bpl_dmabuf = txbmp;
diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c
index 2dedd1493e5b..134bc96dd134 100644
--- a/drivers/scsi/lpfc/lpfc_ct.c
+++ b/drivers/scsi/lpfc/lpfc_ct.c
@@ -1572,8 +1572,8 @@ lpfc_cmpl_ct_cmd_gft_id(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
}
}
} else
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "3065 GFT_ID failed x%08x\n", ulp_status);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_DISCOVERY,
+ "3065 GFT_ID status x%08x\n", ulp_status);
out:
lpfc_ct_free_iocb(phba, cmdiocb);
@@ -1647,6 +1647,18 @@ lpfc_cmpl_ct(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
}
out:
+ /* If the caller wanted a synchronous DA_ID completion, signal the
+ * wait object and clear the flag so the vport delete path can proceed.
+ */
+ if (ndlp->save_flags & NLP_WAIT_FOR_DA_ID) {
+ if (ndlp->da_id_waitq)
+ wake_up(ndlp->da_id_waitq);
+ }
+
+ spin_lock_irq(&ndlp->lock);
+ ndlp->save_flags &= ~NLP_WAIT_FOR_DA_ID;
+ spin_unlock_irq(&ndlp->lock);
+
lpfc_ct_free_iocb(phba, cmdiocb);
lpfc_nlp_put(ndlp);
return;
@@ -2246,7 +2258,7 @@ lpfc_cmpl_ct_disc_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
}
lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY,
- "0229 FDMI cmd %04x failed, latt = %d "
+ "0229 FDMI cmd %04x latt = %d "
"ulp_status: x%x, rid x%x\n",
be16_to_cpu(fdmi_cmd), latt, ulp_status,
ulp_word4);
@@ -2263,9 +2275,9 @@ lpfc_cmpl_ct_disc_fdmi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
/* Check for a CT LS_RJT response */
cmd = be16_to_cpu(fdmi_cmd);
if (be16_to_cpu(fdmi_rsp) == SLI_CT_RESPONSE_FS_RJT) {
- /* FDMI rsp failed */
+ /* Log FDMI reject */
lpfc_printf_vlog(vport, KERN_INFO, LOG_DISCOVERY | LOG_ELS,
- "0220 FDMI cmd failed FS_RJT Data: x%x", cmd);
+ "0220 FDMI cmd FS_RJT Data: x%x", cmd);
/* Should we fallback to FDMI-2 / FDMI-1 ? */
switch (cmd) {
diff --git a/drivers/scsi/lpfc/lpfc_disc.h b/drivers/scsi/lpfc/lpfc_disc.h
index f82615d87c4b..f5ae8cc15820 100644
--- a/drivers/scsi/lpfc/lpfc_disc.h
+++ b/drivers/scsi/lpfc/lpfc_disc.h
@@ -90,6 +90,8 @@ enum lpfc_nlp_save_flags {
NLP_IN_RECOV_POST_DEV_LOSS = 0x1,
/* wait for outstanding LOGO to cmpl */
NLP_WAIT_FOR_LOGO = 0x2,
+ /* wait for outstanding DA_ID to finish */
+ NLP_WAIT_FOR_DA_ID = 0x4
};
struct lpfc_nodelist {
@@ -159,7 +161,12 @@ struct lpfc_nodelist {
uint32_t nvme_fb_size; /* NVME target's supported byte cnt */
#define NVME_FB_BIT_SHIFT 9 /* PRLI Rsp first burst in 512B units. */
uint32_t nlp_defer_did;
+
+ /* These wait objects are NPIV specific. These IOs must complete
+ * synchronously.
+ */
wait_queue_head_t *logo_waitq;
+ wait_queue_head_t *da_id_waitq;
};
struct lpfc_node_rrq {
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index de0ec945d2f1..d737b897ddd8 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -979,7 +979,7 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
phba->fcoe_cvl_eventtag_attn =
phba->fcoe_cvl_eventtag;
lpfc_printf_log(phba, KERN_WARNING, LOG_FIP | LOG_ELS,
- "2611 FLOGI failed on FCF (x%x), "
+ "2611 FLOGI FCF (x%x), "
"status:x%x/x%x, tmo:x%x, perform "
"roundrobin FCF failover\n",
phba->fcf.current_rec.fcf_indx,
@@ -997,11 +997,11 @@ stop_rr_fcf_flogi:
if (!(ulp_status == IOSTAT_LOCAL_REJECT &&
((ulp_word4 & IOERR_PARAM_MASK) ==
IOERR_LOOP_OPEN_FAILURE)))
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "2858 FLOGI failure Status:x%x/x%x TMO"
- ":x%x Data x%lx x%x\n",
- ulp_status, ulp_word4, tmo,
- phba->hba_flag, phba->fcf.fcf_flag);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "2858 FLOGI Status:x%x/x%x TMO"
+ ":x%x Data x%lx x%x\n",
+ ulp_status, ulp_word4, tmo,
+ phba->hba_flag, phba->fcf.fcf_flag);
/* Check for retry */
if (lpfc_els_retry(phba, cmdiocb, rspiocb)) {
@@ -1023,7 +1023,7 @@ stop_rr_fcf_flogi:
lpfc_nlp_put(ndlp);
lpfc_printf_vlog(vport, KERN_WARNING, LOG_ELS,
- "0150 FLOGI failure Status:x%x/x%x "
+ "0150 FLOGI Status:x%x/x%x "
"xri x%x TMO:x%x refcnt %d\n",
ulp_status, ulp_word4, cmdiocb->sli4_xritag,
tmo, kref_read(&ndlp->kref));
@@ -1032,11 +1032,11 @@ stop_rr_fcf_flogi:
if (!(ulp_status == IOSTAT_LOCAL_REJECT &&
((ulp_word4 & IOERR_PARAM_MASK) ==
IOERR_LOOP_OPEN_FAILURE))) {
- /* FLOGI failure */
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "0100 FLOGI failure Status:x%x/x%x "
- "TMO:x%x\n",
- ulp_status, ulp_word4, tmo);
+ /* Warn FLOGI status */
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "0100 FLOGI Status:x%x/x%x "
+ "TMO:x%x\n",
+ ulp_status, ulp_word4, tmo);
goto flogifail;
}
@@ -1964,16 +1964,16 @@ lpfc_cmpl_els_rrq(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
if (ulp_status) {
/* Check for retry */
- /* RRQ failed Don't print the vport to vport rjts */
+ /* Warn RRQ status. Don't print the vport to vport rjts */
if (ulp_status != IOSTAT_LS_RJT ||
(((ulp_word4) >> 16 != LSRJT_INVALID_CMD) &&
((ulp_word4) >> 16 != LSRJT_UNABLE_TPC)) ||
(phba)->pport->cfg_log_verbose & LOG_ELS)
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "2881 RRQ failure DID:%06X Status:"
- "x%x/x%x\n",
- ndlp->nlp_DID, ulp_status,
- ulp_word4);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "2881 RRQ DID:%06X Status:"
+ "x%x/x%x\n",
+ ndlp->nlp_DID, ulp_status,
+ ulp_word4);
}
lpfc_clr_rrq_active(phba, rrq->xritag, rrq);
@@ -2077,16 +2077,16 @@ lpfc_cmpl_els_plogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
}
goto out;
}
- /* PLOGI failed Don't print the vport to vport rjts */
+ /* Warn PLOGI status. Don't print the vport to vport rjts */
if (ulp_status != IOSTAT_LS_RJT ||
(((ulp_word4) >> 16 != LSRJT_INVALID_CMD) &&
((ulp_word4) >> 16 != LSRJT_UNABLE_TPC)) ||
(phba)->pport->cfg_log_verbose & LOG_ELS)
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "2753 PLOGI failure DID:%06X "
- "Status:x%x/x%x\n",
- ndlp->nlp_DID, ulp_status,
- ulp_word4);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "2753 PLOGI DID:%06X "
+ "Status:x%x/x%x\n",
+ ndlp->nlp_DID, ulp_status,
+ ulp_word4);
/* Do not call DSM for lpfc_els_abort'ed ELS cmds */
if (!lpfc_error_lost_link(vport, ulp_status, ulp_word4))
@@ -2323,7 +2323,6 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
struct lpfc_vport *vport = cmdiocb->vport;
struct lpfc_nodelist *ndlp;
char *mode;
- u32 loglevel;
u32 ulp_status;
u32 ulp_word4;
bool release_node = false;
@@ -2372,17 +2371,14 @@ lpfc_cmpl_els_prli(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
* could be expected.
*/
if (test_bit(FC_FABRIC, &vport->fc_flag) ||
- vport->cfg_enable_fc4_type != LPFC_ENABLE_BOTH) {
- mode = KERN_ERR;
- loglevel = LOG_TRACE_EVENT;
- } else {
+ vport->cfg_enable_fc4_type != LPFC_ENABLE_BOTH)
+ mode = KERN_WARNING;
+ else
mode = KERN_INFO;
- loglevel = LOG_ELS;
- }
- /* PRLI failed */
- lpfc_printf_vlog(vport, mode, loglevel,
- "2754 PRLI failure DID:%06X Status:x%x/x%x, "
+ /* Warn PRLI status */
+ lpfc_printf_vlog(vport, mode, LOG_ELS,
+ "2754 PRLI DID:%06X Status:x%x/x%x, "
"data: x%x x%x x%x\n",
ndlp->nlp_DID, ulp_status,
ulp_word4, ndlp->nlp_state,
@@ -2854,11 +2850,11 @@ lpfc_cmpl_els_adisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
}
goto out;
}
- /* ADISC failed */
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "2755 ADISC failure DID:%06X Status:x%x/x%x\n",
- ndlp->nlp_DID, ulp_status,
- ulp_word4);
+ /* Warn ADISC status */
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "2755 ADISC DID:%06X Status:x%x/x%x\n",
+ ndlp->nlp_DID, ulp_status,
+ ulp_word4);
lpfc_disc_state_machine(vport, ndlp, cmdiocb,
NLP_EVT_CMPL_ADISC);
@@ -3045,12 +3041,12 @@ lpfc_cmpl_els_logo(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
* discovery. The PLOGI will retry.
*/
if (ulp_status) {
- /* LOGO failed */
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "2756 LOGO failure, No Retry DID:%06X "
- "Status:x%x/x%x\n",
- ndlp->nlp_DID, ulp_status,
- ulp_word4);
+ /* Warn LOGO status */
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "2756 LOGO, No Retry DID:%06X "
+ "Status:x%x/x%x\n",
+ ndlp->nlp_DID, ulp_status,
+ ulp_word4);
if (lpfc_error_lost_link(vport, ulp_status, ulp_word4))
skip_recovery = 1;
@@ -4837,11 +4833,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
if ((phba->sli3_options & LPFC_SLI3_NPIV_ENABLED) &&
(cmd == ELS_CMD_FDISC) &&
(stat.un.b.lsRjtRsnCodeExp == LSEXP_OUT_OF_RESOURCE)){
- lpfc_printf_vlog(vport, KERN_ERR,
- LOG_TRACE_EVENT,
- "0125 FDISC Failed (x%x). "
- "Fabric out of resources\n",
- stat.un.lsRjtError);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "0125 FDISC (x%x). "
+ "Fabric out of resources\n",
+ stat.un.lsRjtError);
lpfc_vport_set_state(vport,
FC_VPORT_NO_FABRIC_RSCS);
}
@@ -4877,11 +4872,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
LSEXP_NOTHING_MORE) {
vport->fc_sparam.cmn.bbRcvSizeMsb &= 0xf;
retry = 1;
- lpfc_printf_vlog(vport, KERN_ERR,
- LOG_TRACE_EVENT,
- "0820 FLOGI Failed (x%x). "
- "BBCredit Not Supported\n",
- stat.un.lsRjtError);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "0820 FLOGI (x%x). "
+ "BBCredit Not Supported\n",
+ stat.un.lsRjtError);
}
break;
@@ -4891,11 +4885,10 @@ lpfc_els_retry(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
((stat.un.b.lsRjtRsnCodeExp == LSEXP_INVALID_PNAME) ||
(stat.un.b.lsRjtRsnCodeExp == LSEXP_INVALID_NPORT_ID))
) {
- lpfc_printf_vlog(vport, KERN_ERR,
- LOG_TRACE_EVENT,
- "0122 FDISC Failed (x%x). "
- "Fabric Detected Bad WWN\n",
- stat.un.lsRjtError);
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "0122 FDISC (x%x). "
+ "Fabric Detected Bad WWN\n",
+ stat.un.lsRjtError);
lpfc_vport_set_state(vport,
FC_VPORT_FABRIC_REJ_WWN);
}
@@ -5355,8 +5348,8 @@ lpfc_cmpl_els_rsp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
u32 ulp_status, ulp_word4, tmo, did, iotag;
if (!vport) {
- lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
- "3177 ELS response failed\n");
+ lpfc_printf_log(phba, KERN_WARNING, LOG_ELS,
+ "3177 null vport in ELS rsp\n");
goto out;
}
if (cmdiocb->context_un.mbox)
@@ -9658,11 +9651,12 @@ lpfc_els_flush_cmd(struct lpfc_vport *vport)
if (piocb->cmd_flag & LPFC_DRIVER_ABORTED && !mbx_tmo_err)
continue;
- /* On the ELS ring we can have ELS_REQUESTs or
- * GEN_REQUESTs waiting for a response.
+ /* On the ELS ring we can have ELS_REQUESTs, ELS_RSPs,
+ * or GEN_REQUESTs waiting for a CQE response.
*/
ulp_command = get_job_cmnd(phba, piocb);
- if (ulp_command == CMD_ELS_REQUEST64_CR) {
+ if (ulp_command == CMD_ELS_REQUEST64_WQE ||
+ ulp_command == CMD_XMIT_ELS_RSP64_WQE) {
list_add_tail(&piocb->dlist, &abort_list);
/* If the link is down when flushing ELS commands
@@ -11327,10 +11321,10 @@ lpfc_cmpl_els_fdisc(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
/* Check for retry */
if (lpfc_els_retry(phba, cmdiocb, rspiocb))
goto out;
- /* FDISC failed */
- lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
- "0126 FDISC failed. (x%x/x%x)\n",
- ulp_status, ulp_word4);
+ /* Warn FDISC status */
+ lpfc_vlog_msg(vport, KERN_WARNING, LOG_ELS,
+ "0126 FDISC cmpl status: x%x/x%x)\n",
+ ulp_status, ulp_word4);
goto fdisc_failed;
}
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 35c9181c6608..9241075f72fa 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -527,6 +527,9 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp)
* the following lpfc_nlp_put is necessary after fabric node is
* recovered.
*/
+ spin_lock_irqsave(&ndlp->lock, iflags);
+ ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS;
+ spin_unlock_irqrestore(&ndlp->lock, iflags);
if (recovering) {
lpfc_printf_vlog(vport, KERN_INFO,
LOG_DISCOVERY | LOG_NODE,
@@ -539,6 +542,7 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp)
spin_lock_irqsave(&ndlp->lock, iflags);
ndlp->save_flags |= NLP_IN_RECOV_POST_DEV_LOSS;
spin_unlock_irqrestore(&ndlp->lock, iflags);
+ return fcf_inuse;
} else if (ndlp->nlp_state == NLP_STE_UNMAPPED_NODE) {
/* Fabric node fully recovered before this dev_loss_tmo
* queue work is processed. Thus, ignore the
@@ -552,15 +556,9 @@ lpfc_dev_loss_tmo_handler(struct lpfc_nodelist *ndlp)
ndlp->nlp_DID, kref_read(&ndlp->kref),
ndlp, ndlp->nlp_flag,
vport->port_state);
- spin_lock_irqsave(&ndlp->lock, iflags);
- ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS;
- spin_unlock_irqrestore(&ndlp->lock, iflags);
return fcf_inuse;
}
- spin_lock_irqsave(&ndlp->lock, iflags);
- ndlp->nlp_flag &= ~NLP_IN_DEV_LOSS;
- spin_unlock_irqrestore(&ndlp->lock, iflags);
lpfc_nlp_put(ndlp);
return fcf_inuse;
}
diff --git a/drivers/scsi/lpfc/lpfc_hw.h b/drivers/scsi/lpfc/lpfc_hw.h
index 2108b4cb7815..d5c15742f7f2 100644
--- a/drivers/scsi/lpfc/lpfc_hw.h
+++ b/drivers/scsi/lpfc/lpfc_hw.h
@@ -562,6 +562,27 @@ struct fc_vft_header {
#include <uapi/scsi/fc/fc_els.h>
/*
+ * Application Header
+ */
+struct fc_app_header {
+ uint32_t dst_app_id;
+ uint32_t src_app_id;
+#define LOOPBACK_SRC_APPID 0x4321
+ uint32_t word2;
+ uint32_t word3;
+};
+
+/*
+ * dfctl optional header definition
+ */
+enum lpfc_fc_dfctl {
+ LPFC_FC_NO_DEVICE_HEADER,
+ LPFC_FC_16B_DEVICE_HEADER,
+ LPFC_FC_32B_DEVICE_HEADER,
+ LPFC_FC_64B_DEVICE_HEADER,
+};
+
+/*
* Extended Link Service LS_COMMAND codes (Payload Word 0)
*/
#ifdef __BIG_ENDIAN_BITFIELD
diff --git a/drivers/scsi/lpfc/lpfc_hw4.h b/drivers/scsi/lpfc/lpfc_hw4.h
index 500253007b1d..26e1313ebb21 100644
--- a/drivers/scsi/lpfc/lpfc_hw4.h
+++ b/drivers/scsi/lpfc/lpfc_hw4.h
@@ -4847,6 +4847,7 @@ struct fcp_iwrite64_wqe {
#define cmd_buff_len_SHIFT 16
#define cmd_buff_len_MASK 0x00000ffff
#define cmd_buff_len_WORD word3
+/* Note: payload_offset_len field depends on ASIC support */
#define payload_offset_len_SHIFT 0
#define payload_offset_len_MASK 0x0000ffff
#define payload_offset_len_WORD word3
@@ -4863,6 +4864,7 @@ struct fcp_iread64_wqe {
#define cmd_buff_len_SHIFT 16
#define cmd_buff_len_MASK 0x00000ffff
#define cmd_buff_len_WORD word3
+/* Note: payload_offset_len field depends on ASIC support */
#define payload_offset_len_SHIFT 0
#define payload_offset_len_MASK 0x0000ffff
#define payload_offset_len_WORD word3
@@ -4879,6 +4881,7 @@ struct fcp_icmnd64_wqe {
#define cmd_buff_len_SHIFT 16
#define cmd_buff_len_MASK 0x00000ffff
#define cmd_buff_len_WORD word3
+/* Note: payload_offset_len field depends on ASIC support */
#define payload_offset_len_SHIFT 0
#define payload_offset_len_MASK 0x0000ffff
#define payload_offset_len_WORD word3
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index 50620918becd..0dd451009b07 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -4699,6 +4699,7 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
uint64_t wwn;
bool use_no_reset_hba = false;
int rc;
+ u8 if_type;
if (lpfc_no_hba_reset_cnt) {
if (phba->sli_rev < LPFC_SLI_REV4 &&
@@ -4773,10 +4774,24 @@ lpfc_create_port(struct lpfc_hba *phba, int instance, struct device *dev)
shost->max_id = LPFC_MAX_TARGET;
shost->max_lun = vport->cfg_max_luns;
shost->this_id = -1;
- if (phba->sli_rev == LPFC_SLI_REV4)
- shost->max_cmd_len = LPFC_FCP_CDB_LEN_32;
- else
+
+ /* Set max_cmd_len applicable to ASIC support */
+ if (phba->sli_rev == LPFC_SLI_REV4) {
+ if_type = bf_get(lpfc_sli_intf_if_type,
+ &phba->sli4_hba.sli_intf);
+ switch (if_type) {
+ case LPFC_SLI_INTF_IF_TYPE_2:
+ fallthrough;
+ case LPFC_SLI_INTF_IF_TYPE_6:
+ shost->max_cmd_len = LPFC_FCP_CDB_LEN_32;
+ break;
+ default:
+ shost->max_cmd_len = LPFC_FCP_CDB_LEN;
+ break;
+ }
+ } else {
shost->max_cmd_len = LPFC_FCP_CDB_LEN;
+ }
if (phba->sli_rev == LPFC_SLI_REV4) {
if (!phba->cfg_fcp_mq_threshold ||
@@ -10436,6 +10451,7 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba)
struct lpfc_vector_map_info *cpup;
struct lpfc_vector_map_info *eqcpup;
struct lpfc_eq_intr_info *eqi;
+ u32 wqesize;
/*
* Create HBA Record arrays.
@@ -10655,9 +10671,15 @@ lpfc_sli4_queue_create(struct lpfc_hba *phba)
* Create ELS Work Queues
*/
- /* Create slow-path ELS Work Queue */
+ /*
+ * Create slow-path ELS Work Queue.
+ * Increase the ELS WQ size when WQEs contain an embedded CDB.
+ */
+ wqesize = (phba->fcp_embed_io) ?
+ LPFC_WQE128_SIZE : phba->sli4_hba.wq_esize;
+
qdesc = lpfc_sli4_queue_alloc(phba, LPFC_DEFAULT_PAGE_SIZE,
- phba->sli4_hba.wq_esize,
+ wqesize,
phba->sli4_hba.wq_ecount, cpu);
if (!qdesc) {
lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 60cd60ebff38..0eaede8275da 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -4760,7 +4760,7 @@ static int lpfc_scsi_prep_cmnd_buf_s4(struct lpfc_vport *vport,
/* Word 3 */
bf_set(payload_offset_len, &wqe->fcp_icmd,
- sizeof(struct fcp_cmnd32) + sizeof(struct fcp_rsp));
+ sizeof(struct fcp_cmnd) + sizeof(struct fcp_rsp));
/* Word 6 */
bf_set(wqe_ctxt_tag, &wqe->generic.wqe_com,
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 332b8d2348e9..2ec6e55771b4 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -1940,12 +1940,15 @@ lpfc_issue_cmf_sync_wqe(struct lpfc_hba *phba, u32 ms, u64 total)
atot = atomic_xchg(&phba->cgn_sync_alarm_cnt, 0);
wtot = atomic_xchg(&phba->cgn_sync_warn_cnt, 0);
+ spin_lock_irqsave(&phba->hbalock, iflags);
+
/* ONLY Managed mode will send the CMF_SYNC_WQE to the HBA */
if (phba->cmf_active_mode != LPFC_CFG_MANAGED ||
- phba->link_state == LPFC_LINK_DOWN)
- return 0;
+ phba->link_state < LPFC_LINK_UP) {
+ ret_val = 0;
+ goto out_unlock;
+ }
- spin_lock_irqsave(&phba->hbalock, iflags);
sync_buf = __lpfc_sli_get_iocbq(phba);
if (!sync_buf) {
lpfc_printf_log(phba, KERN_ERR, LOG_CGN_MGMT,
@@ -8818,7 +8821,7 @@ lpfc_sli4_hba_setup(struct lpfc_hba *phba)
rc = lpfc_sli4_queue_setup(phba);
if (unlikely(rc)) {
lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
- "0381 Error %d during queue setup.\n ", rc);
+ "0381 Error %d during queue setup.\n", rc);
goto out_stop_timers;
}
/* Initialize the driver internal SLI layer lists. */
@@ -11090,9 +11093,17 @@ __lpfc_sli_prep_xmit_seq64_s4(struct lpfc_iocbq *cmdiocbq,
/* Word 9 */
bf_set(wqe_rcvoxid, &wqe->xmit_sequence.wqe_com, ox_id);
- /* Word 12 */
- if (cmdiocbq->cmd_flag & (LPFC_IO_LIBDFC | LPFC_IO_LOOPBACK))
+ if (cmdiocbq->cmd_flag & (LPFC_IO_LIBDFC | LPFC_IO_LOOPBACK)) {
+ /* Word 10 */
+ if (cmdiocbq->cmd_flag & LPFC_IO_VMID) {
+ bf_set(wqe_appid, &wqe->xmit_sequence.wqe_com, 1);
+ bf_set(wqe_wqes, &wqe->xmit_sequence.wqe_com, 1);
+ wqe->words[31] = LOOPBACK_SRC_APPID;
+ }
+
+ /* Word 12 */
wqe->xmit_sequence.xmit_len = full_size;
+ }
else
wqe->xmit_sequence.xmit_len =
wqe->xmit_sequence.bde.tus.f.bdeSize;
@@ -18431,6 +18442,7 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr)
{
/* make rctl_names static to save stack space */
struct fc_vft_header *fc_vft_hdr;
+ struct fc_app_header *fc_app_hdr;
uint32_t *header = (uint32_t *) fc_hdr;
#define FC_RCTL_MDS_DIAGS 0xF4
@@ -18486,6 +18498,32 @@ lpfc_fc_frame_check(struct lpfc_hba *phba, struct fc_frame_header *fc_hdr)
goto drop;
}
+ if (unlikely(phba->link_flag == LS_LOOPBACK_MODE &&
+ phba->cfg_vmid_app_header)) {
+ /* Application header is 16B device header */
+ if (fc_hdr->fh_df_ctl & LPFC_FC_16B_DEVICE_HEADER) {
+ fc_app_hdr = (struct fc_app_header *) (fc_hdr + 1);
+ if (be32_to_cpu(fc_app_hdr->src_app_id) !=
+ LOOPBACK_SRC_APPID) {
+ lpfc_printf_log(phba, KERN_WARNING,
+ LOG_ELS | LOG_LIBDFC,
+ "1932 Loopback src app id "
+ "not matched, app_id:x%x\n",
+ be32_to_cpu(fc_app_hdr->src_app_id));
+
+ goto drop;
+ }
+ } else {
+ lpfc_printf_log(phba, KERN_WARNING,
+ LOG_ELS | LOG_LIBDFC,
+ "1933 Loopback df_ctl bit not set, "
+ "df_ctl:x%x\n",
+ fc_hdr->fh_df_ctl);
+
+ goto drop;
+ }
+ }
+
lpfc_printf_log(phba, KERN_INFO, LOG_ELS,
"2538 Received frame rctl:x%x, type:x%x, "
"frame Data:%08x %08x %08x %08x %08x %08x %08x\n",
@@ -21149,7 +21187,7 @@ lpfc_drain_txq(struct lpfc_hba *phba)
if (!piocbq) {
spin_unlock_irqrestore(&pring->ring_lock, iflags);
lpfc_printf_log(phba, KERN_ERR, LOG_TRACE_EVENT,
- "2823 txq empty and txq_cnt is %d\n ",
+ "2823 txq empty and txq_cnt is %d\n",
txq_cnt);
break;
}
diff --git a/drivers/scsi/lpfc/lpfc_version.h b/drivers/scsi/lpfc/lpfc_version.h
index 2fe0386a1fee..e70f163fab90 100644
--- a/drivers/scsi/lpfc/lpfc_version.h
+++ b/drivers/scsi/lpfc/lpfc_version.h
@@ -20,7 +20,7 @@
* included with this package. *
*******************************************************************/
-#define LPFC_DRIVER_VERSION "14.4.0.4"
+#define LPFC_DRIVER_VERSION "14.4.0.5"
#define LPFC_DRIVER_NAME "lpfc"
/* Used for SLI 2/3 */
diff --git a/drivers/scsi/lpfc/lpfc_vport.c b/drivers/scsi/lpfc/lpfc_vport.c
index 4439167a5188..7a4d4d8e2ad5 100644
--- a/drivers/scsi/lpfc/lpfc_vport.c
+++ b/drivers/scsi/lpfc/lpfc_vport.c
@@ -626,6 +626,7 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
struct Scsi_Host *shost = lpfc_shost_from_vport(vport);
struct lpfc_hba *phba = vport->phba;
int rc;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waitq);
if (vport->port_type == LPFC_PHYSICAL_PORT) {
lpfc_printf_vlog(vport, KERN_ERR, LOG_TRACE_EVENT,
@@ -679,21 +680,49 @@ lpfc_vport_delete(struct fc_vport *fc_vport)
if (!ndlp)
goto skip_logo;
+ /* Send the DA_ID and Fabric LOGO to cleanup the NPIV fabric entries. */
if (ndlp && ndlp->nlp_state == NLP_STE_UNMAPPED_NODE &&
phba->link_state >= LPFC_LINK_UP &&
phba->fc_topology != LPFC_TOPOLOGY_LOOP) {
if (vport->cfg_enable_da_id) {
- /* Send DA_ID and wait for a completion. */
+ /* Send DA_ID and wait for a completion. This is best
+ * effort. If the DA_ID fails, likely the fabric will
+ * "leak" NportIDs but at least the driver issued the
+ * command.
+ */
+ ndlp = lpfc_findnode_did(vport, NameServer_DID);
+ if (!ndlp)
+ goto issue_logo;
+
+ spin_lock_irq(&ndlp->lock);
+ ndlp->da_id_waitq = &waitq;
+ ndlp->save_flags |= NLP_WAIT_FOR_DA_ID;
+ spin_unlock_irq(&ndlp->lock);
+
rc = lpfc_ns_cmd(vport, SLI_CTNS_DA_ID, 0, 0);
- if (rc) {
- lpfc_printf_log(vport->phba, KERN_WARNING,
- LOG_VPORT,
- "1829 CT command failed to "
- "delete objects on fabric, "
- "rc %d\n", rc);
+ if (!rc) {
+ wait_event_timeout(waitq,
+ !(ndlp->save_flags & NLP_WAIT_FOR_DA_ID),
+ msecs_to_jiffies(phba->fc_ratov * 2000));
}
+
+ lpfc_printf_vlog(vport, KERN_INFO, LOG_VPORT | LOG_ELS,
+ "1829 DA_ID issue status %d. "
+ "SFlag x%x NState x%x, NFlag x%x "
+ "Rpi x%x\n",
+ rc, ndlp->save_flags, ndlp->nlp_state,
+ ndlp->nlp_flag, ndlp->nlp_rpi);
+
+ /* Remove the waitq and save_flags. It no
+ * longer matters if the wake happened.
+ */
+ spin_lock_irq(&ndlp->lock);
+ ndlp->da_id_waitq = NULL;
+ ndlp->save_flags &= ~NLP_WAIT_FOR_DA_ID;
+ spin_unlock_irq(&ndlp->lock);
}
+issue_logo:
/*
* If the vpi is not registered, then a valid FDISC doesn't
* exist and there is no need for a ELS LOGO. Just cleanup
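The DA_ID handshake follows a common kernel shape: publish a wait object and a flag under the node lock, issue the command, then sleep until the completion path clears the flag and wakes the queue (with a 2 * fc_ratov timeout as a backstop). A userspace analogue using a condition variable in place of the waitqueue:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static int wait_for_da_id;

/* Completion side: clear the flag, then wake the waiter. */
static void *completion_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	wait_for_da_id = 0;
	pthread_cond_signal(&waitq);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_mutex_lock(&lock);
	wait_for_da_id = 1;	/* arm the flag before issuing the command */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, completion_path, NULL);

	pthread_mutex_lock(&lock);
	while (wait_for_da_id)	/* the kernel side adds a timeout here */
		pthread_cond_wait(&waitq, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	puts("DA_ID complete");
	return 0;
}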
diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c
index 6c79c350a4d5..4ecf5284c0fc 100644
--- a/drivers/scsi/megaraid/megaraid_sas_base.c
+++ b/drivers/scsi/megaraid/megaraid_sas_base.c
@@ -6380,7 +6380,7 @@ static int megasas_init_fw(struct megasas_instance *instance)
GFP_KERNEL);
if (!fusion->stream_detect_by_ld[i]) {
dev_err(&instance->pdev->dev,
- "unable to allocate stream detect by LD\n ");
+ "unable to allocate stream detect by LD\n");
for (j = 0; j < i; ++j)
kfree(fusion->stream_detect_by_ld[j]);
kfree(fusion->stream_detect_by_ld);
diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h
index 4b7a8f6314a3..00cd18edfad6 100644
--- a/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h
+++ b/drivers/scsi/mpi3mr/mpi/mpi30_cnfg.h
@@ -67,6 +67,7 @@
#define MPI3_SECURITY_PGAD_SLOT_GROUP_MASK (0x0000ff00)
#define MPI3_SECURITY_PGAD_SLOT_GROUP_SHIFT (8)
#define MPI3_SECURITY_PGAD_SLOT_MASK (0x000000ff)
+#define MPI3_INSTANCE_PGAD_INSTANCE_MASK (0x0000ffff)
struct mpi3_config_request {
__le16 host_tag;
u8 ioc_use_only02;
@@ -75,7 +76,8 @@ struct mpi3_config_request {
u8 ioc_use_only06;
u8 msg_flags;
__le16 change_count;
- __le16 reserved0a;
+ u8 proxy_ioc_number;
+ u8 reserved0b;
u8 page_version;
u8 page_number;
u8 page_type;
@@ -206,6 +208,9 @@ struct mpi3_config_page_header {
#define MPI3_MFGPAGE_DEVID_SAS5116_MPI_MGMT (0x00b5)
#define MPI3_MFGPAGE_DEVID_SAS5116_NVME_MGMT (0x00b6)
#define MPI3_MFGPAGE_DEVID_SAS5116_PCIE_SWITCH (0x00b8)
+#define MPI3_MFGPAGE_DEVID_SAS5248_MPI (0x00f0)
+#define MPI3_MFGPAGE_DEVID_SAS5248_MPI_NS (0x00f1)
+#define MPI3_MFGPAGE_DEVID_SAS5248_PCIE_SWITCH (0x00f2)
struct mpi3_man_page0 {
struct mpi3_config_page_header header;
u8 chip_revision[8];
@@ -1074,6 +1079,8 @@ struct mpi3_io_unit_page8 {
#define MPI3_IOUNIT8_SBSTATE_SVN_UPDATE_PENDING (0x04)
#define MPI3_IOUNIT8_SBSTATE_KEY_UPDATE_PENDING (0x02)
#define MPI3_IOUNIT8_SBSTATE_SECURE_BOOT_ENABLED (0x01)
+#define MPI3_IOUNIT8_SBMODE_CURRENT_KEY_IOUNIT17 (0x10)
+#define MPI3_IOUNIT8_SBMODE_HARD_SECURE_RECERTIFIED (0x08)
struct mpi3_io_unit_page9 {
struct mpi3_config_page_header header;
__le32 flags;
@@ -1089,6 +1096,8 @@ struct mpi3_io_unit_page9 {
#define MPI3_IOUNIT9_FLAGS_UBM_ENCLOSURE_ORDER_BACKPLANE_TYPE (0x00000004)
#define MPI3_IOUNIT9_FLAGS_VDFIRST_ENABLED (0x00000001)
#define MPI3_IOUNIT9_FIRSTDEVICE_UNKNOWN (0xffff)
+#define MPI3_IOUNIT9_FIRSTDEVICE_IN_DRIVER_PAGE_0 (0xfffe)
+
struct mpi3_io_unit_page10 {
struct mpi3_config_page_header header;
u8 flags;
@@ -1224,6 +1233,19 @@ struct mpi3_io_unit_page15 {
#define MPI3_IOUNIT15_FLAGS_EPRSUPPORT_WITHOUT_POWER_BRAKE_GPIO (0x01)
#define MPI3_IOUNIT15_FLAGS_EPRSUPPORT_WITH_POWER_BRAKE_GPIO (0x02)
#define MPI3_IOUNIT15_NUMPOWERBUDGETDATA_POWER_BUDGETING_DISABLED (0x00)
+
+struct mpi3_io_unit_page17 {
+ struct mpi3_config_page_header header;
+ u8 num_instances;
+ u8 instance;
+ __le16 reserved0a;
+ __le32 reserved0c[4];
+ __le16 key_length;
+ u8 encryption_algorithm;
+ u8 reserved1f;
+ __le32 current_key[];
+};
+#define MPI3_IOUNIT17_PAGEVERSION (0x00)
struct mpi3_ioc_page0 {
struct mpi3_config_page_header header;
__le32 reserved08;
@@ -1311,7 +1333,7 @@ struct mpi3_driver_page0 {
u8 tur_interval;
u8 reserved10;
u8 security_key_timeout;
- __le16 reserved12;
+ __le16 first_device;
__le32 reserved14;
__le32 reserved18;
};
@@ -1324,10 +1346,13 @@ struct mpi3_driver_page0 {
#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_DEVS (0x00000000)
#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_ONLY (0x00000001)
#define MPI3_DRIVER0_BSDOPTS_REGISTRATION_IOC_AND_INTERNAL_DEVS (0x00000002)
+#define MPI3_DRIVER0_FIRSTDEVICE_IGNORE1 (0x0000)
+#define MPI3_DRIVER0_FIRSTDEVICE_IGNORE2 (0xffff)
struct mpi3_driver_page1 {
struct mpi3_config_page_header header;
__le32 flags;
- __le32 reserved0c;
+ u8 time_stamp_update;
+ u8 reserved0d[3];
__le16 host_diag_trace_max_size;
__le16 host_diag_trace_min_size;
__le16 host_diag_trace_decrement_size;
@@ -2347,6 +2372,10 @@ struct mpi3_device0_vd_format {
#define MPI3_DEVICE0_VD_DEVICE_INFO_SAS (0x0001)
#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_MASK (0xf000)
#define MPI3_DEVICE0_VD_FLAGS_IO_THROTTLE_GROUP_QD_SHIFT (12)
+#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_MASK (0x0003)
+#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_HDD (0x0000)
+#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_SSD (0x0001)
+#define MPI3_DEVICE0_VD_FLAGS_OSEXPOSURE_NO_GUIDANCE (0x0002)
union mpi3_device0_dev_spec_format {
struct mpi3_device0_sas_sata_format sas_sata_format;
struct mpi3_device0_pcie_format pcie_format;
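
The new IO Unit Page 17 above ends in a flexible array member, so the page has no fixed size: current_key[] carries the key material whose length is given by key_length. A minimal sizing sketch for a read buffer, assuming key_length is in bytes (the header does not spell the unit out):

	/* Sketch only: size a buffer for IO Unit Page 17, whose tail is
	 * the variable-length current_key[] array. key_length-in-bytes
	 * is an assumption, not stated in the header.
	 */
	static inline size_t mpi3_iounit17_sz(u16 key_length)
	{
		return sizeof(struct mpi3_io_unit_page17) + key_length;
	}
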
diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_image.h b/drivers/scsi/mpi3mr/mpi/mpi30_image.h
index 7df242190135..2c6e548cbd0f 100644
--- a/drivers/scsi/mpi3mr/mpi/mpi30_image.h
+++ b/drivers/scsi/mpi3mr/mpi/mpi30_image.h
@@ -205,13 +205,14 @@ struct mpi3_encrypted_hash_entry {
u8 hash_image_type;
u8 hash_algorithm;
u8 encryption_algorithm;
- u8 reserved03;
+ u8 flags;
__le16 public_key_size;
__le16 signature_size;
__le32 public_key[MPI3_PUBLIC_KEY_MAX];
};
-
-#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_SIGNATURE (0x03)
+#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH (0x03)
+#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH_1_OF_2 (0x04)
+#define MPI3_HASH_IMAGE_TYPE_KEY_WITH_HASH_2_OF_2 (0x05)
#define MPI3_HASH_ALGORITHM_VERSION_MASK (0xe0)
#define MPI3_HASH_ALGORITHM_VERSION_NONE (0x00)
#define MPI3_HASH_ALGORITHM_VERSION_SHA1 (0x20)
@@ -230,6 +231,12 @@ struct mpi3_encrypted_hash_entry {
#define MPI3_ENCRYPTION_ALGORITHM_RSA4096 (0x05)
#define MPI3_ENCRYPTION_ALGORITHM_RSA3072 (0x06)
+/* hierarchical signature system (hss) */
+#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_87 (0x0b)
+#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_65 (0x0c)
+#define MPI3_ENCRYPTION_ALGORITHM_ML_DSA_44 (0x0d)
+#define MPI3_ENCRYPTED_HASH_ENTRY_FLAGS_PAIRED_KEY_MASK (0x0f)
+
#ifndef MPI3_ENCRYPTED_HASH_ENTRY_MAX
#define MPI3_ENCRYPTED_HASH_ENTRY_MAX (1)
#endif
diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h
index c9fa0d69b75f..c374867f9ba0 100644
--- a/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h
+++ b/drivers/scsi/mpi3mr/mpi/mpi30_ioc.h
@@ -39,6 +39,12 @@ struct mpi3_ioc_init_request {
#define MPI3_WHOINIT_HOST_DRIVER (0x03)
#define MPI3_WHOINIT_MANUFACTURER (0x04)
+#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_MASK (0x00000003)
+#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_NO_GUIDANCE (0x00000000)
+#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_NO_SPECIAL (0x00000001)
+#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_REPORT_AS_HDD (0x00000002)
+#define MPI3_IOCINIT_DRIVERCAP_OSEXPOSURE_REPORT_AS_SSD (0x00000003)
+
struct mpi3_ioc_facts_request {
__le16 host_tag;
u8 ioc_use_only02;
@@ -140,6 +146,8 @@ struct mpi3_ioc_facts_data {
#define MPI3_IOCFACTS_EXCEPT_MANUFACT_CHECKSUM_FAIL (0x0020)
#define MPI3_IOCFACTS_EXCEPT_FW_CHECKSUM_FAIL (0x0010)
#define MPI3_IOCFACTS_EXCEPT_CONFIG_CHECKSUM_FAIL (0x0008)
+#define MPI3_IOCFACTS_EXCEPT_BLOCKING_BOOT_EVENT (0x0004)
+#define MPI3_IOCFACTS_EXCEPT_SECURITY_SELFTEST_FAILURE (0x0002)
#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_MASK (0x0001)
#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_PRIMARY (0x0000)
#define MPI3_IOCFACTS_EXCEPT_BOOTSTAT_SECONDARY (0x0001)
diff --git a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h
index fdc3d1968e43..b2ab25a1cfeb 100644
--- a/drivers/scsi/mpi3mr/mpi/mpi30_transport.h
+++ b/drivers/scsi/mpi3mr/mpi/mpi30_transport.h
@@ -18,7 +18,7 @@ union mpi3_version_union {
#define MPI3_VERSION_MAJOR (3)
#define MPI3_VERSION_MINOR (0)
-#define MPI3_VERSION_UNIT (31)
+#define MPI3_VERSION_UNIT (34)
#define MPI3_VERSION_DEV (0)
#define MPI3_DEVHANDLE_INVALID (0xffff)
struct mpi3_sysif_oper_queue_indexes {
@@ -158,6 +158,7 @@ struct mpi3_sysif_registers {
#define MPI3_SYSIF_FAULT_CODE_SOFT_RESET_NEEDED (0x0000f004)
#define MPI3_SYSIF_FAULT_CODE_POWER_CYCLE_REQUIRED (0x0000f005)
#define MPI3_SYSIF_FAULT_CODE_TEMP_THRESHOLD_EXCEEDED (0x0000f006)
+#define MPI3_SYSIF_FAULT_CODE_INSUFFICIENT_PCI_SLOT_POWER (0x0000f007)
#define MPI3_SYSIF_FAULT_INFO0_OFFSET (0x00001c14)
#define MPI3_SYSIF_FAULT_INFO1_OFFSET (0x00001c18)
#define MPI3_SYSIF_FAULT_INFO2_OFFSET (0x00001c1c)
@@ -410,6 +411,7 @@ struct mpi3_default_reply {
#define MPI3_IOCSTATUS_INSUFFICIENT_RESOURCES (0x0006)
#define MPI3_IOCSTATUS_INVALID_FIELD (0x0007)
#define MPI3_IOCSTATUS_INVALID_STATE (0x0008)
+#define MPI3_IOCSTATUS_SHUTDOWN_ACTIVE (0x0009)
#define MPI3_IOCSTATUS_INSUFFICIENT_POWER (0x000a)
#define MPI3_IOCSTATUS_INVALID_CHANGE_COUNT (0x000b)
#define MPI3_IOCSTATUS_ALLOWED_CMD_BLOCK (0x000c)
diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h
index 1dc640de3efc..fcb0fa31536b 100644
--- a/drivers/scsi/mpi3mr/mpi3mr.h
+++ b/drivers/scsi/mpi3mr/mpi3mr.h
@@ -57,8 +57,8 @@ extern struct list_head mrioc_list;
extern int prot_mask;
extern atomic64_t event_counter;
-#define MPI3MR_DRIVER_VERSION "8.10.0.5.50"
-#define MPI3MR_DRIVER_RELDATE "08-Aug-2024"
+#define MPI3MR_DRIVER_VERSION "8.12.0.0.50"
+#define MPI3MR_DRIVER_RELDATE "05-Sept-2024"
#define MPI3MR_DRIVER_NAME "mpi3mr"
#define MPI3MR_DRIVER_LICENSE "GPL"
@@ -178,7 +178,7 @@ extern atomic64_t event_counter;
#define MPI3MR_DEFAULT_SDEV_QD 32
/* Definitions for Threaded IRQ poll*/
-#define MPI3MR_IRQ_POLL_SLEEP 2
+#define MPI3MR_IRQ_POLL_SLEEP 20
#define MPI3MR_IRQ_POLL_TRIGGER_IOCOUNT 8
/* Definitions for the controller security status*/
@@ -1090,6 +1090,7 @@ struct scmd_priv {
* @evtack_cmds_bitmap: Event Ack bitmap
* @delayed_evtack_cmds_list: Delayed event acknowledgment list
* @ts_update_counter: Timestamp update counter
+ * @ts_update_interval: Timestamp update interval
* @reset_in_progress: Reset in progress flag
* @unrecoverable: Controller unrecoverable flag
* @prev_reset_result: Result of previous reset
@@ -1277,7 +1278,8 @@ struct mpi3mr_ioc {
unsigned long *evtack_cmds_bitmap;
struct list_head delayed_evtack_cmds_list;
- u32 ts_update_counter;
+ u16 ts_update_counter;
+ u16 ts_update_interval;
u8 reset_in_progress;
u8 unrecoverable;
int prev_reset_result;
diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c
index 2e1a92d306b2..f1ab76351bd8 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_fw.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c
@@ -728,7 +728,7 @@ static irqreturn_t mpi3mr_isr_poll(int irq, void *privdata)
mpi3mr_process_op_reply_q(mrioc,
intr_info->op_reply_q);
- usleep_range(MPI3MR_IRQ_POLL_SLEEP, 10 * MPI3MR_IRQ_POLL_SLEEP);
+ usleep_range(MPI3MR_IRQ_POLL_SLEEP, MPI3MR_IRQ_POLL_SLEEP + 1);
} while (atomic_read(&intr_info->op_reply_q->pend_ios) &&
(num_op_reply < mrioc->max_host_ios));
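
The poll sleep rework is really two coupled changes: MPI3MR_IRQ_POLL_SLEEP grows from 2 us to 20 us, and the usleep_range() upper bound shrinks from 10x the minimum to minimum + 1. A wide range lets hrtimers coalesce wakeups, so the old call could legally sleep anywhere in [2, 20] us, while the new one is pinned near 20 us for a predictable poll cadence:

	/* Narrow window: bounded poll latency, less timer coalescing. */
	usleep_range(MPI3MR_IRQ_POLL_SLEEP, MPI3MR_IRQ_POLL_SLEEP + 1);
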
@@ -1362,6 +1362,10 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
int retval = 0;
enum mpi3mr_iocstate ioc_state;
u64 base_info;
+ u8 retry = 0;
+ u64 start_time, elapsed_time_sec;
+
+retry_bring_ioc_ready:
ioc_status = readl(&mrioc->sysif_regs->ioc_status);
ioc_config = readl(&mrioc->sysif_regs->ioc_configuration);
@@ -1380,26 +1384,23 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
ioc_info(mrioc, "controller is in %s state during detection\n",
mpi3mr_iocstate_name(ioc_state));
- if (ioc_state == MRIOC_STATE_BECOMING_READY ||
- ioc_state == MRIOC_STATE_RESET_REQUESTED) {
- timeout = mrioc->ready_timeout * 10;
- do {
- msleep(100);
- } while (--timeout);
+ timeout = mrioc->ready_timeout * 10;
+
+ do {
+ ioc_state = mpi3mr_get_iocstate(mrioc);
+
+ if (ioc_state != MRIOC_STATE_BECOMING_READY &&
+ ioc_state != MRIOC_STATE_RESET_REQUESTED)
+ break;
if (!pci_device_is_present(mrioc->pdev)) {
mrioc->unrecoverable = 1;
- ioc_err(mrioc,
- "controller is not present while waiting to reset\n");
- retval = -1;
+ ioc_err(mrioc, "controller is not present while waiting to reset\n");
goto out_device_not_present;
}
- ioc_state = mpi3mr_get_iocstate(mrioc);
- ioc_info(mrioc,
- "controller is in %s state after waiting to reset\n",
- mpi3mr_iocstate_name(ioc_state));
- }
+ msleep(100);
+ } while (--timeout);
if (ioc_state == MRIOC_STATE_READY) {
ioc_info(mrioc, "issuing message unit reset (MUR) to bring to reset state\n");
@@ -1460,6 +1461,9 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
ioc_config |= MPI3_SYSIF_IOC_CONFIG_ENABLE_IOC;
writel(ioc_config, &mrioc->sysif_regs->ioc_configuration);
+ if (retry == 0)
+ start_time = jiffies;
+
timeout = mrioc->ready_timeout * 10;
do {
ioc_state = mpi3mr_get_iocstate(mrioc);
@@ -1469,6 +1473,12 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
mpi3mr_iocstate_name(ioc_state));
return 0;
}
+ ioc_status = readl(&mrioc->sysif_regs->ioc_status);
+ if ((ioc_status & MPI3_SYSIF_IOC_STATUS_RESET_HISTORY) ||
+ (ioc_status & MPI3_SYSIF_IOC_STATUS_FAULT)) {
+ mpi3mr_print_fault_info(mrioc);
+ goto out_failed;
+ }
if (!pci_device_is_present(mrioc->pdev)) {
mrioc->unrecoverable = 1;
ioc_err(mrioc,
@@ -1477,9 +1487,19 @@ static int mpi3mr_bring_ioc_ready(struct mpi3mr_ioc *mrioc)
goto out_device_not_present;
}
msleep(100);
- } while (--timeout);
+ elapsed_time_sec = jiffies_to_msecs(jiffies - start_time)/1000;
+ } while (elapsed_time_sec < mrioc->ready_timeout);
out_failed:
+ elapsed_time_sec = jiffies_to_msecs(jiffies - start_time)/1000;
+ if ((retry < 2) && (elapsed_time_sec < (mrioc->ready_timeout - 60))) {
+ retry++;
+
+ ioc_warn(mrioc, "retrying to bring IOC ready, retry_count:%d, elapsed time:%llu seconds\n",
+     retry, elapsed_time_sec);
+
+ goto retry_bring_ioc_ready;
+ }
ioc_state = mpi3mr_get_iocstate(mrioc);
ioc_err(mrioc,
"failed to bring to ready state, current state: %s\n",
@@ -2671,7 +2691,7 @@ static void mpi3mr_watchdog_work(struct work_struct *work)
return;
}
- if (mrioc->ts_update_counter++ >= MPI3MR_TSUPDATE_INTERVAL) {
+ if (mrioc->ts_update_counter++ >= mrioc->ts_update_interval) {
mrioc->ts_update_counter = 0;
mpi3mr_sync_timestamp(mrioc);
}
@@ -3845,6 +3865,29 @@ static int mpi3mr_repost_diag_bufs(struct mpi3mr_ioc *mrioc)
}
/**
+ * mpi3mr_read_tsu_interval - Update time stamp interval
+ * @mrioc: Adapter instance reference
+ *
+ * Update time stamp interval if its defined in driver page 1,
+ * otherwise use default value.
+ *
+ * Return: Nothing
+ */
+static void
+mpi3mr_read_tsu_interval(struct mpi3mr_ioc *mrioc)
+{
+ struct mpi3_driver_page1 driver_pg1;
+ u16 pg_sz = sizeof(driver_pg1);
+ int retval = 0;
+
+ mrioc->ts_update_interval = MPI3MR_TSUPDATE_INTERVAL;
+
+ retval = mpi3mr_cfg_get_driver_pg1(mrioc, &driver_pg1, pg_sz);
+ if (!retval && driver_pg1.time_stamp_update)
+ mrioc->ts_update_interval = (driver_pg1.time_stamp_update * 60);
+}
+
+/**
* mpi3mr_print_ioc_info - Display controller information
* @mrioc: Adapter instance reference
*
@@ -4140,6 +4183,7 @@ retry_init:
goto out_failed_noretry;
}
+ mpi3mr_read_tsu_interval(mrioc);
mpi3mr_print_ioc_info(mrioc);
if (!mrioc->cfg_page) {
@@ -4321,6 +4365,7 @@ retry_init:
goto out_failed_noretry;
}
+ mpi3mr_read_tsu_interval(mrioc);
mpi3mr_print_ioc_info(mrioc);
if (is_resume) {
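
mpi3mr_read_tsu_interval(), added above, lets firmware policy drive the timestamp-sync cadence: driver page 1 stores the interval in minutes (judging from the * 60 conversion), the driver falls back to MPI3MR_TSUPDATE_INTERVAL when the page is absent or the field is zero, and the watchdog compares the result against ts_update_counter, which it bumps roughly once per second. The conversion in brief:

	/* Sketch: minutes (page value) to watchdog ticks (~1/second). */
	mrioc->ts_update_interval = driver_pg1.time_stamp_update * 60;
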
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 9a24f7776d64..ed5046593fda 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -8898,9 +8898,8 @@ _base_check_ioc_facts_changes(struct MPT3SAS_ADAPTER *ioc)
ioc->device_remove_in_progress, pd_handles_sz, GFP_KERNEL);
if (!device_remove_in_progress) {
ioc_info(ioc,
- "Unable to allocate the memory for "
- "device_remove_in_progress of sz: %d\n "
- , pd_handles_sz);
+ "Unable to allocate the memory for device_remove_in_progress of sz: %d\n",
+ pd_handles_sz);
return -ENOMEM;
}
memset(device_remove_in_progress +
diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c
index 1e63cb6cd8e3..33e1eba62ca1 100644
--- a/drivers/scsi/pm8001/pm8001_init.c
+++ b/drivers/scsi/pm8001/pm8001_init.c
@@ -100,10 +100,12 @@ static void pm8001_map_queues(struct Scsi_Host *shost)
struct pm8001_hba_info *pm8001_ha = sha->lldd_ha;
struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
- if (pm8001_ha->number_of_intr > 1)
+ if (pm8001_ha->number_of_intr > 1) {
blk_mq_pci_map_queues(qmap, pm8001_ha->pdev, 1);
+ return;
+ }
- return blk_mq_map_queues(qmap);
+ blk_mq_map_queues(qmap);
}
/*
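
The pm8001 hunk fixes a real mapping bug hidden by the void hook: the old code computed the PCI-affinity mapping for the multi-interrupt case and then unconditionally overwrote it with the default mapping via "return blk_mq_map_queues(qmap);". The rewrite makes the two strategies mutually exclusive:

	/* Sketch of the resulting control flow (names from the patch). */
	if (pm8001_ha->number_of_intr > 1) {
		blk_mq_pci_map_queues(qmap, pm8001_ha->pdev, 1); /* IRQ affinity */
		return;
	}
	blk_mq_map_queues(qmap); /* default software spread */
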
diff --git a/drivers/scsi/pm8001/pm80xx_hwi.c b/drivers/scsi/pm8001/pm80xx_hwi.c
index 8fe886dc5e47..a9869cd8c4c0 100644
--- a/drivers/scsi/pm8001/pm80xx_hwi.c
+++ b/drivers/scsi/pm8001/pm80xx_hwi.c
@@ -2037,7 +2037,7 @@ mpi_ssp_completion(struct pm8001_hba_info *pm8001_ha, void *piomb)
atomic_dec(&pm8001_dev->running_req);
break;
}
- pm8001_dbg(pm8001_ha, IO, "scsi_status = 0x%x\n ",
+ pm8001_dbg(pm8001_ha, IO, "scsi_status = 0x%x\n",
psspPayload->ssp_resp_iu.status);
spin_lock_irqsave(&t->task_state_lock, flags);
t->task_state_flags &= ~SAS_TASK_STATE_PENDING;
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 72a4c6e3d0c8..4c5881917d76 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -1946,7 +1946,7 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd)
}
iowrite32(doorbell, pinstance->int_regs.host_ioa_interrupt_reg);
- ioread32(pinstance->int_regs.host_ioa_interrupt_reg),
+ ioread32(pinstance->int_regs.host_ioa_interrupt_reg);
int_reg = ioread32(pinstance->int_regs.ioa_host_interrupt_reg);
pmcraid_info("Waiting for IOA to become operational %x:%x\n",
diff --git a/drivers/scsi/qedf/qedf_io.c b/drivers/scsi/qedf/qedf_io.c
index 054a51713d55..fcfc3bed02c6 100644
--- a/drivers/scsi/qedf/qedf_io.c
+++ b/drivers/scsi/qedf/qedf_io.c
@@ -310,7 +310,7 @@ struct qedf_ioreq *qedf_alloc_cmd(struct qedf_rport *fcport, u8 cmd_type)
if (!free_sqes) {
QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_IO,
- "Returning NULL, free_sqes=%d.\n ",
+ "Returning NULL, free_sqes=%d.\n",
free_sqes);
goto out_failed;
}
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index a9d8a9c62663..d95f417e24c0 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -2760,7 +2760,6 @@ static int resp_mode_sense(struct scsi_cmnd *scp,
else
bd_len = 0;
alloc_len = msense_6 ? cmd[4] : get_unaligned_be16(cmd + 7);
- memset(arr, 0, SDEBUG_MAX_MSENSE_SZ);
if (0x3 == pcontrol) { /* Saving values not supported */
mk_sense_buffer(scp, ILLEGAL_REQUEST, SAVING_PARAMS_UNSUP, 0);
return check_condition_result;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 76f488ef6a7e..41e2dfa2d67d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -38,7 +38,6 @@
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/bio-integrity.h>
#include <linux/hdreg.h>
#include <linux/errno.h>
#include <linux/idr.h>
@@ -3404,7 +3403,7 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp,
rcu_read_lock();
vpd = rcu_dereference(sdkp->device->vpd_pgb1);
- if (!vpd || vpd->len < 8) {
+ if (!vpd || vpd->len <= 8) {
rcu_read_unlock();
return;
}
@@ -4093,9 +4092,38 @@ static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
{
unsigned char cmd[6] = { START_STOP }; /* START_VALID */
struct scsi_sense_hdr sshdr;
+ struct scsi_failure failure_defs[] = {
+ {
+ /* Power on, reset, or bus device reset occurred */
+ .sense = UNIT_ATTENTION,
+ .asc = 0x29,
+ .ascq = 0,
+ .result = SAM_STAT_CHECK_CONDITION,
+ },
+ {
+ /* Power on occurred */
+ .sense = UNIT_ATTENTION,
+ .asc = 0x29,
+ .ascq = 1,
+ .result = SAM_STAT_CHECK_CONDITION,
+ },
+ {
+ /* SCSI bus reset */
+ .sense = UNIT_ATTENTION,
+ .asc = 0x29,
+ .ascq = 2,
+ .result = SAM_STAT_CHECK_CONDITION,
+ },
+ {}
+ };
+ struct scsi_failures failures = {
+ .total_allowed = 3,
+ .failure_definitions = failure_defs,
+ };
const struct scsi_exec_args exec_args = {
.sshdr = &sshdr,
.req_flags = BLK_MQ_REQ_PM,
+ .failures = &failures,
};
struct scsi_device *sdp = sdkp->device;
int res;
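
The failure_defs table added to sd_start_stop_device() teaches scsi_execute_cmd() to retry the command internally when the device reports a unit attention for power-on, bus-device reset, or bus reset (ASC 0x29, ASCQ 0-2), which are expected and transient during resume; total_allowed = 3 caps the combined retries. The same pattern works for any command; a sketch, with sdev and the timeout as placeholders:

	/* Sketch: retry TEST UNIT READY across a power-on unit attention. */
	struct scsi_failure defs[] = {
		{ .sense = UNIT_ATTENTION, .asc = 0x29, .ascq = 0,
		  .result = SAM_STAT_CHECK_CONDITION },
		{}
	};
	struct scsi_failures failures = {
		.total_allowed = 3,
		.failure_definitions = defs,
	};
	const struct scsi_exec_args args = { .failures = &failures };
	unsigned char cmd[6] = { TEST_UNIT_READY };

	scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, NULL, 0, 10 * HZ, 3, &args);
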
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 0d8ce1a92168..d50bad3a2ce9 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -834,6 +834,9 @@ static int flush_buffer(struct scsi_tape *STp, int seek_next)
int backspace, result;
struct st_partstat *STps;
+ if (STp->ready != ST_READY)
+ return 0;
+
/*
* If there was a bus reset, block further access
* to this device.
@@ -841,8 +844,6 @@ static int flush_buffer(struct scsi_tape *STp, int seek_next)
if (STp->pos_unknown)
return (-EIO);
- if (STp->ready != ST_READY)
- return 0;
STps = &(STp->ps[STp->partition]);
if (STps->rw == ST_WRITING) /* Writing */
return st_flush_write_buffer(STp);
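
Reordering the two guards in flush_buffer() means a not-ready drive short-circuits to success before the reset check: with no medium loaded nothing can be buffered, so failing with -EIO merely because pos_unknown is also set would be spurious. The intended precedence:

	/* Sketch: not-ready wins over lost-position. */
	if (STp->ready != ST_READY)
		return 0;    /* no medium, nothing to flush */
	if (STp->pos_unknown)
		return -EIO; /* position lost after bus reset */
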
diff --git a/drivers/scsi/zalon.c b/drivers/scsi/zalon.c
index 22d412cab91d..15602ec862e3 100644
--- a/drivers/scsi/zalon.c
+++ b/drivers/scsi/zalon.c
@@ -139,7 +139,7 @@ zalon_probe(struct parisc_device *dev)
return -ENODEV;
if (request_irq(dev->irq, ncr53c8xx_intr, IRQF_SHARED, "zalon", host)) {
- dev_printk(KERN_ERR, &dev->dev, "irq problem with %d, detaching\n ",
+ dev_printk(KERN_ERR, &dev->dev, "irq problem with %d, detaching\n",
dev->irq);
goto fail;
}
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index c87fdc849c62..ecdfff2456e3 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -93,7 +93,7 @@ static const struct __ufs_qcom_bw_table {
[MODE_HS_RB][UFS_HS_G3][UFS_LANE_2] = { 1492582, 204800 },
[MODE_HS_RB][UFS_HS_G4][UFS_LANE_2] = { 2915200, 409600 },
[MODE_HS_RB][UFS_HS_G5][UFS_LANE_2] = { 5836800, 819200 },
- [MODE_MAX][0][0] = { 7643136, 307200 },
+ [MODE_MAX][0][0] = { 7643136, 819200 },
};
static void ufs_qcom_get_default_testbus_cfg(struct ufs_qcom_host *host);
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index 2e093535884b..e8b4e8c119b5 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -512,8 +512,10 @@ static int search_fb_in_map(int idx)
int i, retval = 0;
for (i = first_fb_vc; i <= last_fb_vc; i++) {
- if (con2fb_map[i] == idx)
+ if (con2fb_map[i] == idx) {
retval = 1;
+ break;
+ }
}
return retval;
}
@@ -523,8 +525,10 @@ static int search_for_mapped_con(void)
int i, retval = 0;
for (i = first_fb_vc; i <= last_fb_vc; i++) {
- if (con2fb_map[i] != -1)
+ if (con2fb_map[i] != -1) {
retval = 1;
+ break;
+ }
}
return retval;
}
@@ -861,6 +865,8 @@ static int set_con2fb_map(int unit, int newidx, int user)
return err;
fbcon_add_cursor_work(info);
+ } else if (vc) {
+ set_blitting_type(vc, info);
}
con2fb_map[unit] = newidx;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
index 4040e247e026..d5a43b3bf45e 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
@@ -129,12 +129,9 @@ omapdss_of_find_source_for_first_ep(struct device_node *node)
return ERR_PTR(-EINVAL);
src_port = of_graph_get_remote_port(ep);
- if (!src_port) {
- of_node_put(ep);
- return ERR_PTR(-EINVAL);
- }
-
of_node_put(ep);
+ if (!src_port)
+ return ERR_PTR(-EINVAL);
src = omap_dss_find_output_by_port_node(src_port);
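
The omapfb cleanup leans on of_graph_get_remote_port() taking its own reference on the node it returns: once src_port is pinned, ep can be released on every path, so the duplicated of_node_put() in the error branch collapses into one unconditional call. The general idiom:

	/* Sketch: release the stepping-stone node once, unconditionally. */
	struct device_node *port = of_graph_get_remote_port(ep);

	of_node_put(ep);
	if (!port)
		return ERR_PTR(-EINVAL);
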
diff --git a/drivers/video/fbdev/sis/sis_main.c b/drivers/video/fbdev/sis/sis_main.c
index 009bf1d92644..75033e6be15a 100644
--- a/drivers/video/fbdev/sis/sis_main.c
+++ b/drivers/video/fbdev/sis/sis_main.c
@@ -183,7 +183,7 @@ static void sisfb_search_mode(char *name, bool quiet)
{
unsigned int j = 0, xres = 0, yres = 0, depth = 0, rate = 0;
int i = 0;
- char strbuf[16], strbuf1[20];
+ char strbuf[24], strbuf1[20];
char *nameptr = name;
/* We don't know the hardware specs yet and there is no ivideo */
diff --git a/fs/afs/afs_vl.h b/fs/afs/afs_vl.h
index 9c65ffb8a523..a06296c8827d 100644
--- a/fs/afs/afs_vl.h
+++ b/fs/afs/afs_vl.h
@@ -134,13 +134,4 @@ struct afs_uvldbentry__xdr {
__be32 spares9;
};
-struct afs_address_list {
- refcount_t usage;
- unsigned int version;
- unsigned int nr_addrs;
- struct sockaddr_rxrpc addrs[];
-};
-
-extern void afs_put_address_list(struct afs_address_list *alist);
-
#endif /* AFS_VL_H */
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 492d857a3fa0..6762eff97517 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -420,6 +420,7 @@ const struct netfs_request_ops afs_req_ops = {
.begin_writeback = afs_begin_writeback,
.prepare_write = afs_prepare_write,
.issue_write = afs_issue_write,
+ .retry_request = afs_retry_request,
};
static void afs_add_open_mmap(struct afs_vnode *vnode)
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 3546b087e791..428721bbe4f6 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -201,7 +201,7 @@ void afs_wait_for_operation(struct afs_operation *op)
}
}
- if (op->call_responded)
+ if (op->call_responded && op->server)
set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
if (!afs_op_error(op)) {
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 580de4adaaf6..b516d05b0fef 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -506,10 +506,10 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_sta
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (estate->responsive_set & ~exclude)
- return 1;
if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
+ if (estate->responsive_set & ~exclude)
+ return 1;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
if (timo == 0)
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index ed09d4d4c211..d612983d6f38 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -632,8 +632,10 @@ iterate_address:
wait_for_more_probe_results:
error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
!(op->flags & AFS_OPERATION_UNINTR));
- if (!error)
+ if (error == 1)
goto iterate_address;
+ if (!error)
+ goto restart_from_beginning;
/* We've now had a failure to respond on all of a server's addresses -
* immediately probe them again and consider retrying the server.
@@ -644,10 +646,13 @@ wait_for_more_probe_results:
error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
!(op->flags & AFS_OPERATION_UNINTR));
switch (error) {
- case 0:
+ case 1:
op->flags &= ~AFS_OPERATION_RETRY_SERVER;
- trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 1);
goto retry_server;
+ case 0:
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
+ goto restart_from_beginning;
case -ERESTARTSYS:
afs_op_set_error(op, error);
goto failed;
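
Underlying the rotate.c churn is a sharpened contract for afs_wait_for_one_fs_probe(): 1 now means some address responded (keep iterating addresses), 0 means the endpoint state was superseded by a newer address list (restart rotation from scratch), and negative values are errors such as -ERESTARTSYS. The fs_probe.c reordering above makes supersession take precedence even when responsive addresses exist, so callers branch three ways:

	/* Sketch: consuming the tri-state probe result. */
	switch (afs_wait_for_one_fs_probe(server, estate, tried, intr)) {
	case 1:  goto iterate_address;         /* got a response */
	case 0:  goto restart_from_beginning;  /* estate superseded */
	default: goto failed;                  /* -ERESTARTSYS etc. */
	}
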
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index e11989a57ca0..47455a85c909 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -501,7 +501,7 @@ found:
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
bch2_bkey_val_to_text(&buf, c, extent2);
- struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+ struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
trans, dup_backpointer_to_bad_csum_extent,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index c711d4c27a03..f4151ee51b03 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -594,6 +594,7 @@ struct bch_dev {
#define BCH_FS_FLAGS() \
x(new_fs) \
x(started) \
+ x(clean_recovery) \
x(btree_running) \
x(accounting_replay_done) \
x(may_go_rw) \
@@ -776,7 +777,7 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
- unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+ unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 8c4addddd07e..84832c2d4df9 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -217,13 +217,13 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1];
- struct bversion version;
+ struct bversion bversion;
__u32 size; /* extent size, in sectors */
struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
struct bpos p;
__u32 size; /* extent size, in sectors */
- struct bversion version;
+ struct bversion bversion;
__u8 pad[1];
#endif
@@ -328,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \
- bkey_format_field(VERSION_HI, version.hi), \
- bkey_format_field(VERSION_LO, version.lo), \
+ bkey_format_field(VERSION_HI, bversion.hi), \
+ bkey_format_field(VERSION_LO, bversion.lo), \
}, \
})
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index e34cb2bf329c..41df24a53d97 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
-static __always_inline int bversion_zero(struct bversion v)
+static __always_inline bool bversion_zero(struct bversion v)
{
- return !bversion_cmp(v, ZERO_VERSION);
+ return bversion_cmp(v, ZERO_VERSION) == 0;
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {}
x(BKEY_FIELD_OFFSET, p.offset) \
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
x(BKEY_FIELD_SIZE, size) \
- x(BKEY_FIELD_VERSION_HI, version.hi) \
- x(BKEY_FIELD_VERSION_LO, version.lo)
+ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
+ x(BKEY_FIELD_VERSION_LO, bversion.lo)
struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS];
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 88d8958281e8..e7ac227ba7e8 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
bch2_bpos_to_text(out, k->p);
- prt_printf(out, " len %u ver %llu", k->size, k->version.lo);
+ prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
} else {
prt_printf(out, "(null)");
}
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3df3dd2723a1..018fb72e32d3 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{
return l->type == r->type &&
- !bversion_cmp(l->version, r->version) &&
+ !bversion_cmp(l->bversion, r->bversion) &&
bpos_eq(l->p, bkey_start_pos(r));
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index b5e0692f03c6..660d2fa02da2 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c)
struct bpos pulled_from_scan = POS_MIN;
int ret = 0;
+ bch2_trans_srcu_unlock(trans);
+
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false;
@@ -599,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) {
BUG_ON(bch2_journal_seq_verify &&
- k.k->version.lo > atomic64_read(&c->journal.seq));
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
- k.k->version.lo > atomic64_read(&c->key_version),
+ k.k->bversion.lo > atomic64_read(&c->key_version),
trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- atomic64_set(&c->key_version, k.k->version.lo);
+ atomic64_set(&c->key_version, k.k->bversion.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index cb48a9477514..1c1448b52207 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset(b, b->set, &b->data->keys);
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data;
@@ -1219,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bkey_val_validate(c, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys &&
- !bversion_cmp(u.k->version, MAX_VERSION))) {
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index b28c649c6838..1e694fedc5da 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
w->ca = ca;
t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
- ret = IS_ERR_OR_NULL(t);
+ ret = PTR_ERR_OR_ZERO(t);
if (ret) {
percpu_ref_put(&ca->io_ref);
closure_put(&cl);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 91884da4e30a..1a74a1a252ee 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
- i->k->k.version.lo = trans->journal_res.seq;
+ i->k->k.bversion.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys)
trans_for_each_update(trans, i)
- i->k->k.version = MAX_VERSION;
+ i->k->k.bversion = MAX_VERSION;
}
h = trans->hooks;
@@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct jset_entry *entry = trans->journal_entries;
- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
- percpu_down_read(&c->mark_lock);
+ percpu_down_read(&c->mark_lock);
+
+ for (entry = trans->journal_entries;
+ entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ entry = vstruct_next(entry))
+ if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+ entry->start->k.type == KEY_TYPE_accounting) {
+ BUG_ON(!trans->journal_res.ref);
+
+ struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) {
- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+ (u64 *) entry - (u64 *) trans->journal_entries);
+ BUG_ON(bversion_zero(a->k.bversion));
- a->k.version = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) entry - (u64 *) trans->journal_entries);
- BUG_ON(bversion_zero(a->k.version));
- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false);
+ if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+ ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
if (ret)
goto revert_fs_usage;
}
- percpu_up_read(&c->mark_lock);
+ }
+ percpu_up_read(&c->mark_lock);
- /* XXX: we only want to run this if deltas are nonzero */
- bch2_trans_account_disk_usage_change(trans);
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
trans_for_each_update(trans, i)
if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
@@ -735,6 +739,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
+ trans_for_each_update(trans, i) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
+ i->bkey_type, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
+ trans->fn, (void *) i->ip_allocated);
+ goto fatal_err;
+ }
+ btree_insert_entry_checks(trans, i);
+ }
+
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bch_validate_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
+ trans->fn);
+ goto fatal_err;
+ }
+ }
+
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -798,7 +836,7 @@ revert_fs_usage:
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, false, false);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
bch2_accounting_neg(a);
}
percpu_up_read(&c->mark_lock);
@@ -1019,40 +1057,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (ret)
goto out_reset;
- trans_for_each_update(trans, i) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
- i->bkey_type, invalid_flags);
- if (unlikely(ret)){
- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
- trans->fn, (void *) i->ip_allocated);
- return ret;
- }
- btree_insert_entry_checks(trans, i);
- }
-
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i)) {
- enum bch_validate_flags invalid_flags = 0;
-
- if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
-
- ret = bch2_journal_entry_validate(c, NULL, i,
- bcachefs_metadata_version_current,
- CPU_BIG_ENDIAN, invalid_flags);
- if (unlikely(ret)) {
- bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
- trans->fn);
- return ret;
- }
- }
-
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
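
Net effect of the btree_trans_commit.c reshuffle: bkey and journal-entry validation now runs inside bch2_trans_commit_write_locked(), after the triggers have produced the final keys, and a validation failure jumps to fatal_err so the already-applied accounting deltas are reverted rather than leaked by an early return (which is what the deleted __bch2_trans_commit() copies did). The error-path rule in miniature:

	/* Sketch: inside the locked section, failures must unwind state. */
	ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), i->bkey_type, flags);
	if (unlikely(ret))
		goto fatal_err; /* routes through revert_fs_usage, not return */
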
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 60393e98084d..6a454f2fa005 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
if (type && k.k->type != type)
return ERR_PTR(-ENOENT);
- mut = bch2_trans_kmalloc_nomemzero(trans, bytes);
+ /* extra padding for varint_decode_fast... */
+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
if (!IS_ERR(mut)) {
bkey_reassemble(mut, k);
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 757b9884ef55..462b1a2fe1ad 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
- m->op.version = k.k->version;
+ m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index e972e2bca546..9f3133e3e7e5 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
void *end = &acc_k + 1;
int ret = 0;
+ bkey_fsck_err_on(bversion_zero(k.k->bversion),
+ c, accounting_key_version_0,
+ "accounting key with version=0");
+
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_nr_inodes:
end = field_end(acc_k, nr_inodes);
@@ -291,7 +295,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
struct accounting_mem_entry n = {
.pos = a.k->p,
- .version = a.k->version,
+ .bversion = a.k->bversion,
.nr_counters = bch2_accounting_counters(a.k),
.v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
sizeof(u64), GFP_KERNEL),
@@ -319,11 +323,13 @@ err:
return -BCH_ERR_ENOMEM_disk_accounting;
}
-int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc)
+int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_replicas_padded r;
- if (accounting_to_replicas(&r.e, a.k->p) &&
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
return -BCH_ERR_btree_insert_need_mark_replicas;
@@ -566,7 +572,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
- bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false);
+ bch2_accounting_mem_mod_locked(trans,
+ bkey_i_to_s_c_accounting(&k_i.k),
+ BCH_ACCOUNTING_normal);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -589,30 +597,14 @@ fsck_err:
static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
if (k.k->type != KEY_TYPE_accounting)
return 0;
percpu_down_read(&c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true);
+ int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
+ BCH_ACCOUNTING_read);
percpu_up_read(&c->mark_lock);
-
- if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
- ret == -BCH_ERR_btree_insert_need_mark_replicas)
- ret = 0;
-
- struct disk_accounting_pos acc;
- bpos_to_disk_accounting_pos(&acc, k.k->p);
-
- if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (bch2_accounting_key_to_text(&buf, &acc),
- buf.buf)))
- ret = bch2_accounting_update_sb_one(c, k.k->p);
-fsck_err:
- printbuf_exit(&buf);
return ret;
}
@@ -624,6 +616,7 @@ int bch2_accounting_read(struct bch_fs *c)
{
struct bch_accounting_mem *acc = &c->accounting;
struct btree_trans *trans = bch2_trans_get(c);
+ struct printbuf buf = PRINTBUF;
int ret = for_each_btree_key(trans, iter,
BTREE_ID_accounting, POS_MIN,
@@ -647,7 +640,7 @@ int bch2_accounting_read(struct bch_fs *c)
accounting_pos_cmp, &k.k->p);
bool applied = idx < acc->k.nr &&
- bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0;
+ bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
if (applied)
continue;
@@ -655,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (i + 1 < &darray_top(*keys) &&
i[1].k->k.type == KEY_TYPE_accounting &&
!journal_key_cmp(i, i + 1)) {
- BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0);
+ WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
i[1].journal_seq = i[0].journal_seq;
@@ -674,6 +667,45 @@ int bch2_accounting_read(struct bch_fs *c)
keys->gap = keys->nr = dst - keys->data;
percpu_down_read(&c->mark_lock);
+ for (unsigned i = 0; i < acc->k.nr; i++) {
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+
+ if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
+ continue;
+
+ struct bch_replicas_padded r;
+ if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
+ continue;
+
+ /*
+ * If the replicas entry is invalid it'll get cleaned up by
+ * check_allocations:
+ */
+ if (bch2_replicas_entry_validate(&r.e, c, &buf))
+ continue;
+
+ struct disk_accounting_pos k;
+ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
+
+ if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+ trans, accounting_replicas_not_marked,
+ "accounting not marked in superblock replicas\n %s",
+ (printbuf_reset(&buf),
+ bch2_accounting_key_to_text(&buf, &k),
+ buf.buf))) {
+ /*
+ * We're not RW yet and still single threaded, dropping
+ * and retaking lock is ok:
+ */
+ percpu_up_read(&c->mark_lock);
+ ret = bch2_mark_replicas(c, &r.e);
+ if (ret)
+ goto fsck_err;
+ percpu_down_read(&c->mark_lock);
+ }
+ }
+
preempt_disable();
struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
@@ -709,8 +741,10 @@ int bch2_accounting_read(struct bch_fs *c)
}
}
preempt_enable();
+fsck_err:
percpu_up_read(&c->mark_lock);
err:
+ printbuf_exit(&buf);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index f29fd0dd9581..4ea6c8a092bc 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
dst->v.d[i] += src.v->d[i];
- if (bversion_cmp(dst->k.version, src.k->version) < 0)
- dst->k.version = src.k->version;
+ if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
+ dst->k.bversion = src.k->bversion;
}
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
@@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r)
return bpos_cmp(*l, *r);
}
-int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool);
+enum bch_accounting_mode {
+ BCH_ACCOUNTING_normal,
+ BCH_ACCOUNTING_gc,
+ BCH_ACCOUNTING_read,
+};
+
+int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
/*
* Update in memory counters so they match the btree update we're doing; called
* from transaction commit path
*/
-static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read)
+static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
+ struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
{
struct bch_fs *c = trans->c;
+ struct bch_accounting_mem *acc = &c->accounting;
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, a.k->p);
+ bool gc = mode == BCH_ACCOUNTING_gc;
+
+ EBUG_ON(gc && !acc->gc_running);
if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
return 0;
- if (!gc && !read) {
+ if (mode == BCH_ACCOUNTING_normal) {
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved:
trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
@@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
}
}
- struct bch_accounting_mem *acc = &c->accounting;
unsigned idx;
- EBUG_ON(gc && !acc->gc_running);
-
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, gc);
+ int ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
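
Folding the old (gc, read) boolean pair into enum bch_accounting_mode makes each call site say which phase it is in and removes the unrepresentable gc-and-read combination; the EBUG_ON(gc && !acc->gc_running) assertion also moves earlier so it runs before any early return. Call-site translation:

	/* Sketch: bool pair -> mode enum. */
	bch2_accounting_mem_mod_locked(trans, a, BCH_ACCOUNTING_normal); /* was false, false */
	bch2_accounting_mem_mod_locked(trans, a, BCH_ACCOUNTING_gc);     /* was true,  false */
	bch2_accounting_mem_mod_locked(trans, a, BCH_ACCOUNTING_read);   /* was false, true  */
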
diff --git a/fs/bcachefs/disk_accounting_types.h b/fs/bcachefs/disk_accounting_types.h
index 1687a45177a7..b1982131b206 100644
--- a/fs/bcachefs/disk_accounting_types.h
+++ b/fs/bcachefs/disk_accounting_types.h
@@ -6,7 +6,7 @@
struct accounting_mem_entry {
struct bpos pos;
- struct bversion version;
+ struct bversion bversion;
unsigned nr_counters;
u64 __percpu *v[2];
};
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 95afa7bf2020..3a16b535b6c3 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c,
if (!c)
c = trans->c;
- WARN_ON(!trans && bch2_current_has_btree_trans(c));
+ /*
+ * Ugly: if there's a transaction in the current task it has to be
+ * passed in to unlock if we prompt for user input.
+ *
+ * But, plumbing a transaction and transaction restarts into
+ * bkey_validate() is problematic.
+ *
+ * So:
+ * - make all bkey errors AUTOFIX, they're simple anyways (we just
+ * delete the key)
+ * - and we don't need to warn if we're not prompting
+ */
+ WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 2f1b86978f36..21ee7211b03e 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -184,7 +184,7 @@ do { \
ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \
} \
- int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \
+ int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 9b3470a97546..0d8b782b63fb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -21,6 +21,49 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+{
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+ return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+}
+
+static void dirent_inode_mismatch_msg(struct printbuf *out,
+ struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ prt_str(out, "inode points to dirent that does not point back:");
+ prt_newline(out);
+ bch2_bkey_val_to_text(out, c, dirent.s_c);
+ prt_newline(out);
+ bch2_inode_unpacked_to_text(out, inode);
+}
+
+static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+{
+ int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
+}
+
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -346,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
+ if (!inode->bi_dir)
+ return 0;
- d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
- POS(inode->bi_dir, inode->bi_dir_offset), 0,
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c_dirent d =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
+ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0,
dirent);
- ret = bkey_err(d) ?:
+ int ret = bkey_err(d) ?:
+ dirent_points_to_inode(c, d, inode) ?:
__remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -371,7 +417,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret;
ret = remove_backpointer(trans, &inode);
- bch_err_msg(c, ret, "removing dirent");
+ if (!bch2_err_matches(ret, ENOENT))
+ bch_err_msg(c, ret, "removing dirent");
if (ret)
return ret;
@@ -626,12 +673,12 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
u32 snapshot;
- bool seen_this_pos;
u64 count;
};
struct inode_walker {
bool first_this_inode;
+ bool have_inodes;
bool recalculate_sums;
struct bpos last_pos;
@@ -669,6 +716,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
+ /*
+ * We no longer have inodes for w->last_pos; clear this to avoid
+ * screwing up check_i_sectors/check_subdir_count if we take a
+ * transaction restart here:
+ */
+ w->have_inodes = false;
w->recalculate_sums = false;
w->inodes.nr = 0;
@@ -686,6 +739,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
+ w->have_inodes = true;
return 0;
}
@@ -740,9 +794,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
- } else if (bkey_cmp(w->last_pos, k.k->p)) {
- darray_for_each(w->inodes, i)
- i->seen_this_pos = false;
}
w->last_pos = k.k->p;
@@ -896,21 +947,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
}
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *inode)
-{
- return d.v->d_type == DT_SUBVOL
- ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
- : le64_to_cpu(d.v->d_inum) == inode->bi_inum;
-}
-
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
@@ -920,13 +956,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
return ret;
}
-static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+static int check_inode_dirent_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
- u32 inode_snapshot, bool *write_inode)
+ bool *write_inode)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ u32 inode_snapshot = inode->bi_snapshot;
struct btree_iter dirent_iter = {};
struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
int ret = bkey_err(d);
@@ -936,13 +973,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
- fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+ (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
trans, inode_points_to_wrong_dirent,
- "inode points to dirent that does not point back:\n%s",
- (bch2_bkey_val_to_text(&buf, c, inode_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ "%s",
+ (printbuf_reset(&buf),
+ dirent_inode_mismatch_msg(&buf, c, d, inode),
+ buf.buf))) {
/*
* We just clear the backpointer fields for now. If we find a
* dirent that points to this inode in check_dirents(), we'll
@@ -963,7 +1000,7 @@ fsck_err:
return ret;
}
-static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
+static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
@@ -972,7 +1009,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
/* snapshot tree corruption, can't safely delete */
if (!inum.subvol) {
- bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot);
+ bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
}
@@ -1045,30 +1082,44 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_flags & BCH_INODE_unlinked) {
- ret = check_inode_deleted_list(trans, k.k->p);
- if (ret < 0)
- return ret;
+ if (!test_bit(BCH_FS_started, &c->flags)) {
+ /*
+ * If we're not in online fsck, don't delete unlinked
+ * inodes, just make sure they're on the deleted list.
+ *
+ * They might be referred to by a logged operation -
+ * i.e. we might have crashed in the middle of a
+ * truncate on an unlinked but open file - so we want to
+ * let the delete_dead_inodes kill it after resuming
+ * logged ops.
+ */
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ return ret;
- fsck_err_on(!ret,
- trans, unlinked_inode_not_on_deleted_list,
- "inode %llu:%u unlinked, but not on deleted list",
- u.bi_inum, k.k->p.snapshot);
- ret = 0;
- }
+ fsck_err_on(!ret,
+ trans, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
- if (u.bi_flags & BCH_INODE_unlinked &&
- !bch2_inode_open(c, k.k->p) &&
- (!c->sb.clean ||
- fsck_err(trans, inode_unlinked_but_clean,
- "filesystem marked clean, but inode %llu unlinked",
- u.bi_inum))) {
- ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
- bch_err_msg(c, ret, "in fsck deleting inode");
- return ret;
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
+ if (ret)
+ goto err;
+ } else {
+ if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
+ trans, inode_unlinked_and_not_open,
+ "inode %llu%u unlinked and not open",
+ u.bi_inum, u.bi_snapshot)) {
+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
+ bch_err_msg(c, ret, "in fsck deleting inode");
+ return ret;
+ }
+ }
}
+ /* i_size_dirty is vestigial, since we now have logged ops for truncate */
if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
@@ -1097,8 +1148,9 @@ static int check_inode(struct btree_trans *trans,
do_update = true;
}
+ /* i_sectors_dirty is vestigial, i_sectors is always updated transactionally */
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!c->sb.clean ||
+ (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
@@ -1126,7 +1178,7 @@ static int check_inode(struct btree_trans *trans,
}
if (u.bi_dir || u.bi_dir_offset) {
- ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+ ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
}
@@ -1555,10 +1607,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
struct snapshots_seen *s,
- struct extent_ends *extent_ends)
+ struct extent_ends *extent_ends,
+ struct disk_reservation *res)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -1568,7 +1620,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
}
- if (inode->last_pos.inode != k.k->p.inode) {
+ if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
ret = check_i_sectors(trans, inode);
if (ret)
goto err;
@@ -1578,12 +1630,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- i = walk_inode(trans, inode, k);
- ret = PTR_ERR_OR_ZERO(i);
+ struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
+ ret = PTR_ERR_OR_ZERO(extent_i);
if (ret)
goto err;
- ret = check_key_has_inode(trans, iter, inode, i, k);
+ ret = check_key_has_inode(trans, iter, inode, extent_i, k);
if (ret)
goto err;
@@ -1592,24 +1644,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums);
if (ret)
goto err;
- }
- /*
- * Check inodes in reverse order, from oldest snapshots to newest,
- * starting from the inode that matches this extent's snapshot. If we
- * didn't have one, iterate over all inodes:
- */
- if (!i)
- i = &darray_last(inode->inodes);
-
- for (;
- inode->inodes.data && i >= inode->inodes.data;
- --i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
- continue;
+ /*
+ * Check inodes in reverse order, from oldest snapshots to
+ * newest, starting from the inode that matches this extent's
+ * snapshot. If we didn't have one, iterate over all inodes:
+ */
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
- if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
@@ -1629,13 +1676,25 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
iter->k.type = KEY_TYPE_whiteout;
+ break;
}
-
- if (bkey_extent_is_allocation(k.k))
- i->count += k.k->size;
}
+ }
- i->seen_this_pos = true;
+ ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (bkey_extent_is_allocation(k.k)) {
+ for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
+ inode->inodes.data && i >= inode->inodes.data;
+ --i) {
+ if (i->snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ continue;
+
+ i->count += k.k->size;
+ }
}
if (k.k->type != KEY_TYPE_whiteout) {
@@ -1666,13 +1725,11 @@ int bch2_check_extents(struct bch_fs *c)
extent_ends_init(&extent_ends);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL,
- BCH_TRANS_COMMIT_no_enospc, ({
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors_notnested(trans, &w));
@@ -1758,6 +1815,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (inode_points_to_dirent(target, d))
@@ -1770,7 +1828,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
prt_printf(&buf, "\n "),
bch2_inode_unpacked_to_text(&buf, target),
buf.buf)))
- goto out_noiter;
+ goto err;
if (!target->bi_dir &&
!target->bi_dir_offset) {
@@ -1779,7 +1837,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target, target_snapshot);
}
- struct btree_iter bp_iter = { NULL };
struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
ret = bkey_err(bp_dirent);
@@ -1840,7 +1897,6 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
-out_noiter:
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -2075,7 +2131,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (dir->last_pos.inode != k.k->p.inode) {
+ if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
ret = check_subdir_count(trans, dir);
if (ret)
goto err;
@@ -2137,11 +2193,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
}
-
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
- i->count++;
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ if (d.v->d_type == DT_DIR)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ i->count++;
out:
err:
fsck_err:
@@ -2164,12 +2224,9 @@ int bch2_check_dirents(struct bch_fs *c)
snapshots_seen_init(&s);
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots,
- k,
- NULL, NULL,
- BCH_TRANS_COMMIT_no_enospc,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
check_subdir_count_notnested(trans, &dir));
@@ -2314,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v)
return false;
}
-/*
- * We've checked that inode backpointers point to valid dirents; here, it's
- * sufficient to check that the subvolume root has a dirent:
- */
-static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
-{
- struct bch_inode_unpacked inode;
- int ret = bch2_inode_find_by_inum_trans(trans,
- (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
- &inode);
- if (ret)
- return ret;
-
- return inode.bi_dir != 0;
-}
-
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
@@ -2348,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
- ret = subvol_has_dirent(trans, s);
- if (ret < 0)
+ struct bch_inode_unpacked subvol_root;
+ ret = bch2_inode_find_by_inum_trans(trans,
+ (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+ &subvol_root);
+ if (ret)
break;
- if (fsck_err_on(!ret,
+ /*
+ * We've checked that inode backpointers point to valid dirents;
+ * here, it's sufficient to check that the subvolume root has a
+ * dirent:
+ */
+ if (fsck_err_on(!subvol_root.bi_dir,
trans, subvol_unreachable,
"unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c),
+ prt_newline(&buf),
+ bch2_inode_unpacked_to_text(&buf, &subvol_root),
buf.buf))) {
ret = reattach_subvol(trans, s);
break;
@@ -2450,10 +2501,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (ret && !bch2_err_matches(ret, ENOENT))
break;
- if (!ret && !dirent_points_to_inode(d, &inode)) {
+ if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
- }
if (bch2_err_matches(ret, ENOENT)) {
ret = 0;
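
The fsck.c hunks above share one theme: the per-key transaction commit moves out of the for_each_btree_key_commit() wrapper and into check_extent()/check_dirent() themselves, so the btree update is committed before the in-memory counters (i_sectors, subdir counts) are bumped. A minimal sketch of the resulting shape, with hypothetical helper names standing in for the real check logic:

static int check_one_key(struct btree_trans *trans, struct btree_iter *iter,
			 struct bkey_s_c k, struct disk_reservation *res)
{
	int ret = repair_key_if_needed(trans, iter, k);	/* hypothetical */
	if (ret)
		return ret;

	/* Commit the repair first... */
	ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		return ret;

	/*
	 * ...then update in-memory accounting, so a failed or restarted
	 * commit can't leave the counters double-updated.
	 */
	account_key(k);					/* hypothetical */
	return 0;
}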
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 6ac0ff7e074b..753c208896c3 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- if (likely(k.k->type == KEY_TYPE_inode_v3))
- return bch2_inode_unpack_v3(k, unpacked);
- return bch2_inode_unpack_slowpath(k, unpacked);
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
+ return likely(k.k->type == KEY_TYPE_inode_v3)
+ ? bch2_inode_unpack_v3(k, unpacked)
+ : bch2_inode_unpack_slowpath(k, unpacked);
}
int bch2_inode_peek_nowarn(struct btree_trans *trans,
@@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{
- prt_printf(out, "inum: %llu ", inode->bi_inum);
+ prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
__bch2_inode_unpacked_to_text(out, inode);
}
@@ -1111,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot))
goto delete;
- if (c->sb.clean &&
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index f1fcb4c58039..695abd707cb6 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -69,6 +69,7 @@ typedef u64 u96;
struct bch_inode_unpacked {
u64 bi_inum;
+ u32 bi_snapshot;
u64 bi_journal_seq;
__le64 bi_hash_seed;
u64 bi_size;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index b2f50e74bb76..e4fc17c548fd 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if ((ret = bkey_err(k)))
goto out;
- if (bversion_cmp(k.k->version, rbio->version) ||
+ if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out;
@@ -1031,7 +1031,7 @@ get_bio:
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
- rbio->version = k.k->version;
+ rbio->version = k.k->bversion;
rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index d3b5be7fd9bf..b5fe9e0dc155 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos;
e->k.size = crc.uncompressed_size;
- e->k.version = version;
+ e->k.bversion = version;
if (crc.csum_type ||
crc.compression_type ||
@@ -1544,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos;
- id->k.version = op->version;
+ id->k.bversion = op->version;
id->k.size = sectors;
iter = bio->bi_iter;
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 30460bce04be..954f6a96e0f4 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
goto out;
}
- if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
+ if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) {
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index f49fdca1d07d..6f4a4e1083c9 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk;
u32 restart_count = trans->restart_count;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
+ trans, logged_op_but_clean,
+ "filesystem marked as clean but have logged op\n%s",
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf));
if (!fn)
return 0;
@@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c);
-
- return trans_was_restarted(trans, restart_count);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret ?: trans_was_restarted(trans, restart_count);
}
int bch2_resume_logged_ops(struct bch_fs *c)
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index be1e7ca4362f..6db72d3bad7d 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */
- if (bversion_cmp(old.k->version, k->k->k.version) >= 0) {
+ if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
ret = 0;
goto out;
}
@@ -717,6 +717,8 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags);
+ if (c->sb.clean)
+ set_bit(BCH_FS_clean_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -862,6 +864,9 @@ use_clean:
clear_bit(BCH_FS_fsck_running, &c->flags);
+ /* in case we don't run journal replay, i.e. norecovery mode */
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+
/* fsync if we fixed errors */
if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {
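
The BCH_FS_clean_recovery flag introduced here latches "was the superblock clean at mount?" once, before recovery starts; the check sites above (fsck.c, inode.c, logged_ops.c) then test the flag instead of the live c->sb.clean field, presumably because the live field can change once recovery rewrites the superblock. A sketch of the capture-once pattern, simplified from the hunk above:

int recovery_begin(struct bch_fs *c)
{
	/* Latch the mount-time state before anything mutates the sb: */
	if (c->sb.clean)
		set_bit(BCH_FS_clean_recovery, &c->flags);

	/* ...journal replay, superblock writes, etc. may follow... */
	return 0;
}

/* Later consumers ask the latched flag, not the live field: */
static bool mounted_clean(struct bch_fs *c)
{
	return test_bit(BCH_FS_clean_recovery, &c->flags);
}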
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 8c7dee5983d2..50406ce0e4ef 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -50,7 +50,7 @@
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index e59c0abb4772..f457925fa362 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size);
- r_v->k.version = orig->k.version;
+ r_v->k.bversion = orig->k.bversion;
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 998c0bd06802..bcb3276747e0 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
+static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
@@ -94,6 +94,16 @@ bad:
return -BCH_ERR_invalid_replicas_entry;
}
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_fs *c,
+ struct printbuf *err)
+{
+ mutex_lock(&c->sb_lock);
+ int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
+ mutex_unlock(&c->sb_lock);
+ return ret;
+}
+
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
@@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate(e, sb, err);
+ int ret = bch2_replicas_entry_validate_locked(e, sb, err);
if (ret)
return ret;
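
The replicas.c split follows the usual _locked convention: the journal validate path (journal_io.c above) runs without sb_lock and now gets a public wrapper that takes it, while bch2_cpu_replicas_validate(), already under the lock, calls the bare variant. Generic shape of the pattern, with illustrative names:

static int thing_validate_locked(struct thing *t)	/* caller holds lock */
{
	lockdep_assert_held(&t->lock);		/* optional; documents the contract */
	/* ...checks that read fields protected by t->lock... */
	return 0;
}

int thing_validate(struct thing *t)			/* public entry point */
{
	mutex_lock(&t->lock);
	int ret = thing_validate_locked(t);
	mutex_unlock(&t->lock);
	return ret;
}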
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 622482559c3d..5aba2c1ce133 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
- struct bch_sb *, struct printbuf *);
+ struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 *
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index 025848a9c4c0..005275281804 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) {
+ kfree(clean);
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index c7e4cdd3f6a5..5102059a0f1d 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
if (!first)
prt_char(out, ',');
first = false;
- unsigned e = le16_to_cpu(i->errors[j]);
- prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
+ bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
}
prt_newline(out);
}
@@ -353,7 +352,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]);
- downgrade_table_extra(c, &table);
+ ret = downgrade_table_extra(c, &table);
+ if (ret)
+ goto out;
if (!dst->recovery_passes[0] &&
!dst->recovery_passes[1] &&
@@ -399,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
unsigned e = le16_to_cpu(i->errors[j]);
- if (e < BCH_SB_ERR_MAX)
+ if (e < BCH_FSCK_ERR_MAX)
__set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8)
__set_bit_le64(e, ext->errors_silent);
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index c1270d790e43..013a96883b4e 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -7,12 +7,12 @@
const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_SB_ERRS()
- NULL
+#undef x
};
-static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
- if (id < BCH_SB_ERR_MAX)
+ if (id < BCH_FSCK_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]);
else
prt_printf(out, "(unknown error %u)", id);
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index 8889001e7db4..b2357b8e6107 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -6,6 +6,8 @@
extern const char * const bch2_sb_error_strs[];
+void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index f0c14702f9e6..ed5dca5e1161 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -210,22 +210,23 @@ enum bch_fsck_flags {
x(inode_snapshot_mismatch, 196, 0) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
+ x(inode_unlinked_and_not_open, 281, 0) \
x(inode_checksum_type_invalid, 199, 0) \
x(inode_compression_type_invalid, 200, 0) \
x(inode_subvol_root_but_not_dir, 201, 0) \
- x(inode_i_size_dirty_but_clean, 202, 0) \
- x(inode_i_sectors_dirty_but_clean, 203, 0) \
- x(inode_i_sectors_wrong, 204, 0) \
- x(inode_dir_wrong_nlink, 205, 0) \
- x(inode_dir_multiple_links, 206, 0) \
- x(inode_multiple_links_but_nlink_0, 207, 0) \
- x(inode_wrong_backpointer, 208, 0) \
- x(inode_wrong_nlink, 209, 0) \
- x(inode_unreachable, 210, 0) \
- x(deleted_inode_but_clean, 211, 0) \
- x(deleted_inode_missing, 212, 0) \
- x(deleted_inode_is_dir, 213, 0) \
- x(deleted_inode_not_unlinked, 214, 0) \
+ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
+ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
+ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
+ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
+ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
+ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
+ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
+ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_unreachable, 210, FSCK_AUTOFIX) \
+ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
+ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
+ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
+ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \
@@ -255,7 +256,7 @@ enum bch_fsck_flags {
x(dir_loop, 241, 0) \
x(hash_table_key_duplicate, 242, 0) \
x(hash_table_key_wrong_offset, 243, 0) \
- x(unlinked_inode_not_on_deleted_list, 244, 0) \
+ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
@@ -270,7 +271,7 @@ enum bch_fsck_flags {
x(subvol_children_not_set, 256, 0) \
x(subvol_children_bad, 257, 0) \
x(subvol_loop, 258, 0) \
- x(subvol_unreachable, 259, 0) \
+ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
@@ -282,8 +283,8 @@ enum bch_fsck_flags {
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \
- x(alloc_key_stripe_sectors_wrong, 271, 0) \
- x(accounting_mismatch, 272, 0) \
+ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
+ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
@@ -292,12 +293,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
+ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
+ x(MAX, 284, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
- BCH_SB_ERR_MAX
};
struct bch_sb_field_errors {
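
The sentinel change above (drop the trailing NULL and the implicit BCH_SB_ERR_MAX, add an explicit x(MAX, 284, 0) entry) matters because the error IDs are not listed in numeric order, so an enum member declared after the expansion would take last-listed-plus-one rather than the true maximum. A self-contained sketch of the x-macro pattern BCH_SB_ERRS() uses to generate both the enum and the string table from one list:

#define ERRS()			\
	x(first_error,	0)	\
	x(third_error,	2)	\
	x(second_error,	1)	\
	x(MAX,		3)

enum err_id {
#define x(t, n) ERR_##t = n,
	ERRS()
#undef x
};

static const char * const err_strs[] = {
#define x(t, n) [n] = #t,
	ERRS()
#undef x
};
/* ERR_MAX == 3 regardless of listing order; err_strs[] is indexed by ID. */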
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 9cbd3c14c94f..617d07e53b20 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
ret = -1 - SIX_LOCK_write;
}
} else if (type == SIX_LOCK_write && lock->readers) {
- if (try) {
+ if (try)
atomic_add(SIX_LOCK_HELD_write, &lock->state);
- smp_mb__after_atomic();
- }
+ /*
+ * Make sure the atomic_add above (or six_set_bitmask in the slow
+ * path) happens before pcpu_read_count.
+ *
+ * Paired with the smp_mb() in the read lock fast path (per-cpu mode)
+ * and the one before atomic_read in the read unlock path.
+ */
+ smp_mb();
ret = !pcpu_read_count(lock);
if (try && !ret) {
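
The six.c fix makes the smp_mb() unconditional: in the slow path the writer's claim is published by six_set_bitmask() rather than by the atomic_add(), and that store too must be ordered before pcpu_read_count() scans the per-cpu reader counts. A self-contained userspace model of the Dekker-style store/fence/load pairing, using C11 atomics in place of the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>

struct toy_lock {
	atomic_int writer_claim;
	atomic_int readers;
};

/* Each side publishes its claim, fences, then reads the other side's
 * claim; the full barriers guarantee at least one of the two observes
 * the other, so both can never succeed at once. */
static bool write_trylock(struct toy_lock *l)
{
	atomic_fetch_add(&l->writer_claim, 1);		/* publish claim */
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (atomic_load_explicit(&l->readers, memory_order_relaxed)) {
		atomic_fetch_sub(&l->writer_claim, 1);	/* back off */
		return false;
	}
	return true;
}

static bool read_trylock(struct toy_lock *l)
{
	atomic_fetch_add(&l->readers, 1);		/* publish claim */
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (atomic_load_explicit(&l->writer_claim, memory_order_relaxed)) {
		atomic_fetch_sub(&l->readers, 1);	/* back off */
		return false;
	}
	return true;
}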
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 8b18a9b483a4..1809442b00ee 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 id = snapshot_root;
u32 subvol = 0, s;
+ rcu_read_lock();
while (id) {
s = snapshot_t(c, id)->subvol;
@@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
id = bch2_snapshot_tree_next(c, id);
}
+ rcu_read_unlock();
return subvol;
}
@@ -1782,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
new->k.p.snapshot = leaf_id;
ret = bch2_trans_update(trans, &iter, new, 0);
out:
+ bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
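
The rcu_read_lock() added to bch2_snapshot_tree_oldest_subvol() follows the standard rule that every snapshot_t() dereference of the RCU-protected snapshot table must sit inside a read-side critical section. A simplified (not verbatim) sketch of the walk under RCU:

static u32 oldest_subvol(struct bch_fs *c, u32 snapshot_root)
{
	u32 subvol = 0, id = snapshot_root;

	rcu_read_lock();
	while (id) {
		u32 s = snapshot_t(c, id)->subvol;	/* RCU-protected table */
		if (s)
			subvol = s;		/* keep walking: want the oldest */
		id = bch2_snapshot_tree_next(c, id);
	}
	rcu_read_unlock();

	return subvol;
}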
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index dbe834cb349f..6845dde1b339 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans,
}
struct bch_inode_unpacked inode;
- struct btree_iter inode_iter = {};
- ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+ ret = bch2_inode_find_by_inum_nowarn_trans(trans,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
- 0);
- bch2_trans_iter_exit(trans, &inode_iter);
-
- if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
-
- if (fsck_err_on(ret,
- trans, subvol_to_missing_root,
- "subvolume %llu points to missing subvolume root %llu:%u",
- k.k->p.offset, le64_to_cpu(subvol.v->inode),
- le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- return ret ?: -BCH_ERR_transaction_restart_nested;
- }
-
- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
- trans, subvol_root_wrong_bi_subvol,
- "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
- inode.bi_inum, inode_iter.k.p.snapshot,
- inode.bi_subvol, subvol.k->p.offset)) {
- inode.bi_subvol = subvol.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
- if (ret)
+ &inode);
+ if (!ret) {
+ if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+ trans, subvol_root_wrong_bi_subvol,
+ "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+ inode.bi_inum, inode.bi_snapshot,
+ inode.bi_subvol, subvol.k->p.offset)) {
+ inode.bi_subvol = subvol.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+ if (ret)
+ goto err;
+ }
+ } else if (bch2_err_matches(ret, ENOENT)) {
+ if (fsck_err(trans, subvol_to_missing_root,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+ ret = bch2_subvolume_delete(trans, iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ ret = ret ?: -BCH_ERR_transaction_restart_nested;
goto err;
+ }
+ } else {
+ goto err;
}
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
@@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans,
"%s: snapshot tree %u not found", __func__, snapshot_tree);
if (ret)
- return ret;
+ goto err;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
trans, subvol_not_master_and_not_snapshot,
@@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans,
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(s);
if (ret)
- return ret;
+ goto err;
SET_BCH_SUBVOLUME_SNAP(&s->v, true);
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index d86d5dae54c9..ce7410d72089 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -799,8 +799,10 @@ retry:
i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i);
- if (offset == opt_get(*opts, sb))
+ if (offset == opt_get(*opts, sb)) {
+ ret = -BCH_ERR_invalid;
continue;
+ }
ret = read_one_super(sb, offset, &err);
if (!ret)
@@ -1188,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
prt_printf(out, "Errors to silently fix:\t");
- prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8);
+ prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
+ min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
prt_newline(out);
kfree(errors_silent);
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 01b768c9b767..b2f209743afe 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start;
- k.k_i.k.version.lo = test_version++;
+ k.k_i.k.bversion.lo = test_version++;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f53977169db4..2b3f9935dbb4 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -595,14 +595,12 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
* write and readdir but not lookup or open).
*/
touch_atime(&file->f_path);
- dput(dentry);
return true;
check_failed:
fscache_cookie_lookup_negative(object->cookie);
cachefiles_unmark_inode_in_use(object, file);
fput(file);
- dput(dentry);
if (ret == -ESTALE)
return cachefiles_create_file(object);
return false;
@@ -611,7 +609,6 @@ error_fput:
fput(file);
error:
cachefiles_do_unmark_inode_in_use(object, d_inode(dentry));
- dput(dentry);
return false;
}
@@ -654,7 +651,9 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
goto new_file;
}
- if (!cachefiles_open_file(object, dentry))
+ ret = cachefiles_open_file(object, dentry);
+ dput(dentry);
+ if (!ret)
return false;
_leave(" = t [%lu]", file_inode(object->file)->i_ino);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5d9ccda098cc..53fef258c2bc 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -96,7 +96,6 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
/* dirty the head */
spin_lock(&ci->i_ceph_lock);
- BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
if (__ceph_have_pending_cap_snap(ci)) {
struct ceph_cap_snap *capsnap =
list_last_entry(&ci->i_cap_snaps,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 808c9c048276..bed34fc11c91 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/iversion.h>
#include <linux/filelock.h>
+#include <linux/jiffies.h>
#include "super.h"
#include "mds_client.h"
@@ -4149,7 +4150,7 @@ retry:
ceph_remove_cap(mdsc, cap, false);
goto out_unlock;
} else if (tsession) {
- /* add placeholder for the export tagert */
+ /* add placeholder for the export target */
int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
tcap = new_cap;
ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
@@ -4602,7 +4603,7 @@ flush_cap_releases:
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
goto done;
bad:
@@ -4659,7 +4660,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
* slowness doesn't block mdsc delayed work,
* preventing send_renew_caps() from running.
*/
- if (jiffies - loop_start >= 5 * HZ)
+ if (time_after_eq(jiffies, loop_start + 5 * HZ))
break;
}
spin_unlock(&mdsc->cap_delay_lock);
@@ -4701,6 +4702,28 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
+/*
+ * Flush all cap releases to the mds
+ */
+static void flush_cap_releases(struct ceph_mds_session *s)
+{
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&s->s_cap_lock);
+ if (s->s_num_cap_releases)
+ ceph_flush_session_cap_releases(mdsc, s);
+ spin_unlock(&s->s_cap_lock);
+ doutc(cl, "done\n");
+}
+
+void ceph_flush_cap_releases(struct ceph_mds_client *mdsc)
+{
+ ceph_mdsc_iterate_sessions(mdsc, flush_cap_releases, true);
+}
+
void __ceph_touch_fmode(struct ceph_inode_info *ci,
struct ceph_mds_client *mdsc, int fmode)
{
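
The loop-timeout change in ceph_check_delayed_caps() swaps open-coded jiffies arithmetic for time_after_eq(), the wrap-safe comparison helper from <linux/jiffies.h>; it is defined roughly as (long)((a) - (b)) >= 0, doing the subtraction first and interpreting it as signed, so the comparison stays correct across a jiffies wrap. Usage in deadline form:

unsigned long loop_start = jiffies;

for (;;) {
	/* ...do one unit of work... */

	/* Stop once the loop has run for at least 5 seconds, even if
	 * jiffies wrapped in between: */
	if (time_after_eq(jiffies, loop_start + 5 * HZ))
		break;
}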
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ddec8c9244ee..952109292d69 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -2058,7 +2058,7 @@ static int ceph_d_delete(const struct dentry *dentry)
return 0;
if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP)
return 0;
- /* vaild lease? */
+ /* valid lease? */
di = ceph_dentry(dentry);
if (di) {
if (__dentry_lease_is_valid(di))
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4a8eec46254b..315ef02f9a3f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1779,7 +1779,7 @@ retry_lookup:
if (err < 0)
goto done;
} else if (rinfo->head->is_dentry && req->r_dentry) {
- /* parent inode is not locked, be carefull */
+ /* parent inode is not locked, be careful */
struct ceph_vino *ptvino = NULL;
dvino.ino = le64_to_cpu(rinfo->diri.in->ino);
dvino.snap = le64_to_cpu(rinfo->diri.in->snapid);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 276e34ab3e2c..c4a5fd94bbbb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2266,7 +2266,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
trim_caps - remaining);
}
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
return 0;
}
@@ -2420,7 +2420,7 @@ static void ceph_cap_release_work(struct work_struct *work)
ceph_put_mds_session(session);
}
-void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
+void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_client *cl = mdsc->fsc->client;
@@ -2447,7 +2447,7 @@ void __ceph_queue_cap_release(struct ceph_mds_session *session,
session->s_num_cap_releases++;
if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE))
- ceph_flush_cap_releases(session->s_mdsc, session);
+ ceph_flush_session_cap_releases(session->s_mdsc, session);
}
static void ceph_cap_reclaim_work(struct work_struct *work)
@@ -4340,7 +4340,7 @@ skip_cap_auths:
/* flush cap releases */
spin_lock(&session->s_cap_lock);
if (session->s_num_cap_releases)
- ceph_flush_cap_releases(mdsc, session);
+ ceph_flush_session_cap_releases(mdsc, session);
spin_unlock(&session->s_cap_lock);
send_flushmsg_ack(mdsc, session, seq);
@@ -4910,7 +4910,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
} else {
recon_state.msg_version = 2;
}
- /* trsaverse this session's caps */
+ /* traverse this session's caps */
err = ceph_iterate_session_caps(session, reconnect_caps_cb, &recon_state);
spin_lock(&session->s_cap_lock);
@@ -5446,7 +5446,7 @@ static void delayed_work(struct work_struct *work)
}
mutex_unlock(&mdsc->mutex);
- ceph_flush_cap_releases(mdsc, s);
+ ceph_flush_session_cap_releases(mdsc, s);
mutex_lock(&s->s_mutex);
if (renew_caps)
@@ -5877,6 +5877,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_flush_dirty_caps(mdsc);
+ ceph_flush_cap_releases(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
if (!list_empty(&mdsc->cap_flush_list)) {
@@ -6015,6 +6016,18 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
ceph_mdsmap_destroy(mdsc->mdsmap);
kfree(mdsc->sessions);
ceph_caps_finalize(mdsc);
+
+ if (mdsc->s_cap_auths) {
+ int i;
+
+ for (i = 0; i < mdsc->s_cap_auths_num; i++) {
+ kfree(mdsc->s_cap_auths[i].match.gids);
+ kfree(mdsc->s_cap_auths[i].match.path);
+ kfree(mdsc->s_cap_auths[i].match.fs_name);
+ }
+ kfree(mdsc->s_cap_auths);
+ }
+
ceph_pool_perm_destroy(mdsc);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 9bcc7f181bfe..3dd54587944a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -559,9 +559,6 @@ extern struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s);
extern void ceph_put_mds_session(struct ceph_mds_session *s);
-extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg, int mds);
-
extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
@@ -602,8 +599,8 @@ extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
-extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session);
+extern void ceph_flush_session_cap_releases(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0cdf84cd1791..73f321b52895 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -126,6 +126,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
if (!wait) {
doutc(cl, "(non-blocking)\n");
ceph_flush_dirty_caps(fsc->mdsc);
+ ceph_flush_cap_releases(fsc->mdsc);
doutc(cl, "(non-blocking) done\n");
return 0;
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6e817bf1337c..2508aa8950b7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1056,8 +1056,6 @@ extern int ceph_fill_trace(struct super_block *sb,
extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session);
-extern int ceph_inode_holds_cap(struct inode *inode, int mask);
-
extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode);
@@ -1208,10 +1206,6 @@ static inline void ceph_init_inode_acls(struct inode *inode,
struct ceph_acl_sec_ctx *as_ctx)
{
}
-static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
-{
- return 0;
-}
static inline void ceph_forget_all_cached_acls(struct inode *inode)
{
@@ -1270,6 +1264,7 @@ extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
extern int ceph_encode_inode_release(void **p, struct inode *inode,
int mds, int drop, int unless, int force);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index c9f0ed24cb7b..c562aec3b483 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -58,6 +58,7 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/*
* misc.c
*/
+struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq);
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
bool needs_put);
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 0ad0982ce0e2..63280791de3b 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -9,34 +9,66 @@
#include "internal.h"
/*
- * Append a folio to the rolling queue.
+ * Make sure there's space in the rolling queue.
*/
-int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
- bool needs_put)
+struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq)
{
- struct folio_queue *tail = rreq->buffer_tail;
- unsigned int slot, order = folio_order(folio);
+ struct folio_queue *tail = rreq->buffer_tail, *prev;
+ unsigned int prev_nr_slots = 0;
if (WARN_ON_ONCE(!rreq->buffer && tail) ||
WARN_ON_ONCE(rreq->buffer && !tail))
- return -EIO;
-
- if (!tail || folioq_full(tail)) {
- tail = kmalloc(sizeof(*tail), GFP_NOFS);
- if (!tail)
- return -ENOMEM;
- netfs_stat(&netfs_n_folioq);
- folioq_init(tail);
- tail->prev = rreq->buffer_tail;
- if (tail->prev)
- tail->prev->next = tail;
- rreq->buffer_tail = tail;
- if (!rreq->buffer) {
- rreq->buffer = tail;
- iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
+ return ERR_PTR(-EIO);
+
+ prev = tail;
+ if (prev) {
+ if (!folioq_full(tail))
+ return tail;
+ prev_nr_slots = folioq_nr_slots(tail);
+ }
+
+ tail = kmalloc(sizeof(*tail), GFP_NOFS);
+ if (!tail)
+ return ERR_PTR(-ENOMEM);
+ netfs_stat(&netfs_n_folioq);
+ folioq_init(tail);
+ tail->prev = prev;
+ if (prev)
+ /* [!] NOTE: After we set prev->next, the consumer is entirely
+ * at liberty to delete prev.
+ */
+ WRITE_ONCE(prev->next, tail);
+
+ rreq->buffer_tail = tail;
+ if (!rreq->buffer) {
+ rreq->buffer = tail;
+ iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0);
+ } else {
+ /* Make sure we don't leave the master iterator pointing to a
+ * block that might get immediately consumed.
+ */
+ if (rreq->io_iter.folioq == prev &&
+ rreq->io_iter.folioq_slot == prev_nr_slots) {
+ rreq->io_iter.folioq = tail;
+ rreq->io_iter.folioq_slot = 0;
}
- rreq->buffer_tail_slot = 0;
}
+ rreq->buffer_tail_slot = 0;
+ return tail;
+}
+
+/*
+ * Append a folio to the rolling queue.
+ */
+int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
+ bool needs_put)
+{
+ struct folio_queue *tail;
+ unsigned int slot, order = folio_order(folio);
+
+ tail = netfs_buffer_make_space(rreq);
+ if (IS_ERR(tail))
+ return PTR_ERR(tail);
rreq->io_iter.count += PAGE_SIZE << order;
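
The netfs_buffer_make_space() rewrite is careful about publication order: a new folio_queue segment is fully initialised before WRITE_ONCE(prev->next, tail) makes it visible, and (per the [!] NOTE) the consumer may free prev the instant that store lands, so prev must not be touched afterwards. A toy model of the same append-and-publish step, mirroring the diff's choice of WRITE_ONCE:

struct seg {
	struct seg *next;
	struct seg *prev;
	/* ...payload slots... */
};

static struct seg *append_seg(struct seg *prev)
{
	struct seg *s = kmalloc(sizeof(*s), GFP_NOFS);

	if (!s)
		return NULL;

	s->next = NULL;			/* 1: initialise the new segment */
	s->prev = prev;			/*    completely first...        */

	if (prev)
		WRITE_ONCE(prev->next, s);	/* 2: ...then publish; prev may
						 *    be freed from here on. */
	return s;
}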
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 04e66d587f77..6293f547e4c3 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -153,12 +153,22 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
loff_t start)
{
struct netfs_io_subrequest *subreq;
+ struct iov_iter *wreq_iter = &wreq->io_iter;
+
+ /* Make sure we don't point the iterator at a used-up folio_queue
+ * struct being used as a placeholder to prevent the queue from
+ * collapsing. In such a case, extend the queue.
+ */
+ if (iov_iter_is_folioq(wreq_iter) &&
+ wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) {
+ netfs_buffer_make_space(wreq);
+ }
subreq = netfs_alloc_subrequest(wreq);
subreq->source = stream->source;
subreq->start = start;
subreq->stream_nr = stream->stream_nr;
- subreq->io_iter = wreq->io_iter;
+ subreq->io_iter = *wreq_iter;
_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
@@ -307,6 +317,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
struct netfs_io_stream *stream;
struct netfs_group *fgroup; /* TODO: Use this with ceph */
struct netfs_folio *finfo;
+ size_t iter_off = 0;
size_t fsize = folio_size(folio), flen = fsize, foff = 0;
loff_t fpos = folio_pos(folio), i_size;
bool to_eof = false, streamw = false;
@@ -462,7 +473,12 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
if (choose_s < 0)
break;
stream = &wreq->io_streams[choose_s];
- wreq->io_iter.iov_offset = stream->submit_off;
+
+ /* Advance the iterator(s). */
+ if (stream->submit_off > iter_off) {
+ iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off);
+ iter_off = stream->submit_off;
+ }
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_extendable_to = fsize - stream->submit_off;
@@ -477,8 +493,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
debug = true;
}
- wreq->io_iter.iov_offset = 0;
- iov_iter_advance(&wreq->io_iter, fsize);
+ if (fsize > iter_off)
+ iov_iter_advance(&wreq->io_iter, fsize - iter_off);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug)
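
netfs_write_folio() previously poked wreq->io_iter.iov_offset directly, which goes wrong once the iterator is a folio_queue that also tracks a segment slot; the fix tracks how much of the folio has already been consumed and advances by the delta with iov_iter_advance(), which walks segments correctly. The bookkeeping, factored into a helper for clarity (the helper itself is illustrative, not from the patch):

static void advance_iter_to(struct iov_iter *it, size_t target, size_t *consumed)
{
	/* Consume only what hasn't been consumed yet. */
	if (target > *consumed) {
		iov_iter_advance(it, target - *consumed);
		*consumed = target;
	}
}

With iter_off starting at 0 per folio, each stream submission becomes advance_iter_to(&wreq->io_iter, stream->submit_off, &iter_off), and the folio is finished off with advance_iter_to(&wreq->io_iter, fsize, &iter_off).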
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 7ffdc88dfb52..80675b6bf884 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -120,6 +120,7 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct nsproxy *nsp __free(put_nsproxy) = NULL;
struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
+ struct pid_namespace *pid_ns;
if (arg)
return -EINVAL;
@@ -202,7 +203,9 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PIDFD_GET_PID_NAMESPACE:
if (IS_ENABLED(CONFIG_PID_NS)) {
rcu_read_lock();
- ns_common = to_ns_common( get_pid_ns(task_active_pid_ns(task)));
+ pid_ns = task_active_pid_ns(task);
+ if (pid_ns)
+ ns_common = to_ns_common(get_pid_ns(pid_ns));
rcu_read_unlock();
}
break;
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index 7481b21a0489..2d851f596a72 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -416,7 +416,7 @@ find_timestamp(struct cifs_ses *ses)
}
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
- const struct nls_table *nls_cp)
+ const struct nls_table *nls_cp, struct shash_desc *hmacmd5)
{
int rc = 0;
int len;
@@ -425,34 +425,26 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
wchar_t *domain;
wchar_t *server;
- if (!ses->server->secmech.hmacmd5) {
- cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
- return -1;
- }
-
/* calculate md4 hash of password */
E_md4hash(ses->password, nt_hash, nls_cp);
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm, nt_hash,
- CIFS_NTHASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NT Hash as a key\n", __func__);
+ cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc);
return rc;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
return rc;
}
/* convert ses->user_name to unicode */
len = ses->user_name ? strlen(ses->user_name) : 0;
user = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (user == NULL) {
- rc = -ENOMEM;
- return rc;
- }
+ if (user == NULL)
+ return -ENOMEM;
if (len) {
len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
@@ -461,11 +453,10 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
*(u16 *)user = 0;
}
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)user, 2 * len);
+ rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len);
kfree(user);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with user\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc);
return rc;
}
@@ -474,19 +465,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = strlen(ses->domainName);
domain = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (domain == NULL) {
- rc = -ENOMEM;
- return rc;
- }
+ if (domain == NULL)
+ return -ENOMEM;
+
len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
nls_cp);
- rc =
- crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)domain, 2 * len);
+ rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len);
kfree(domain);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with domain\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc);
return rc;
}
} else {
@@ -494,33 +481,27 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = strlen(ses->ip_addr);
server = kmalloc(2 + (len * 2), GFP_KERNEL);
- if (server == NULL) {
- rc = -ENOMEM;
- return rc;
- }
- len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len,
- nls_cp);
- rc =
- crypto_shash_update(ses->server->secmech.hmacmd5,
- (char *)server, 2 * len);
+ if (server == NULL)
+ return -ENOMEM;
+
+ len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp);
+ rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len);
kfree(server);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with server\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc);
return rc;
}
}
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ntlmv2_hash);
+ rc = crypto_shash_final(hmacmd5, ntlmv2_hash);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
return rc;
}
static int
-CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
+CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5)
{
int rc;
struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
@@ -531,43 +512,33 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
offsetof(struct ntlmv2_resp, challenge.key[0]));
- if (!ses->server->secmech.hmacmd5) {
- cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
- return -1;
- }
-
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
return rc;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
return rc;
}
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
- memcpy(ntlmv2->challenge.key,
- ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
- memcpy(ntlmv2->challenge.key,
- ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- ntlmv2->challenge.key, hash_len);
+ memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+
+ rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
return rc;
}
/* Note that the MD5 digest over writes anon.challenge_key.key */
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ntlmv2->ntlmv2_hash);
+ rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
return rc;
}
@@ -575,6 +546,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
+ struct shash_desc *hmacmd5 = NULL;
int rc;
int baselen;
unsigned int tilen;
@@ -640,55 +612,51 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
cifs_server_lock(ses->server);
- rc = cifs_alloc_hash("hmac(md5)", &ses->server->secmech.hmacmd5);
+ rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
if (rc) {
+ cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
goto unlock;
}
/* calculate ntlmv2_hash */
- rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
+ rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5);
if (rc) {
- cifs_dbg(VFS, "Could not get v2 hash rc %d\n", rc);
+ cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc);
goto unlock;
}
/* calculate first part of the client response (CR1) */
- rc = CalcNTLMv2_response(ses, ntlmv2_hash);
+ rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5);
if (rc) {
- cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
+ cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc);
goto unlock;
}
/* now calculate the session key for NTLMv2 */
- rc = crypto_shash_setkey(ses->server->secmech.hmacmd5->tfm,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
- __func__);
+ cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_init(ses->server->secmech.hmacmd5);
+ rc = crypto_shash_init(hmacmd5);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
+ cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_update(ses->server->secmech.hmacmd5,
- ntlmv2->ntlmv2_hash,
- CIFS_HMAC_MD5_HASH_SIZE);
+ rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
+ cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc);
goto unlock;
}
- rc = crypto_shash_final(ses->server->secmech.hmacmd5,
- ses->auth_key.response);
+ rc = crypto_shash_final(hmacmd5, ses->auth_key.response);
if (rc)
- cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
-
+ cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc);
unlock:
cifs_server_unlock(ses->server);
+ cifs_free_hash(&hmacmd5);
setup_ntlmv2_rsp_ret:
kfree_sensitive(tiblob);
@@ -732,16 +700,19 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
cifs_free_hash(&server->secmech.aes_cmac);
cifs_free_hash(&server->secmech.hmacsha256);
cifs_free_hash(&server->secmech.md5);
- cifs_free_hash(&server->secmech.sha512);
- cifs_free_hash(&server->secmech.hmacmd5);
- if (server->secmech.enc) {
- crypto_free_aead(server->secmech.enc);
- server->secmech.enc = NULL;
- }
+ if (!SERVER_IS_CHAN(server)) {
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
+ server->secmech.enc = NULL;
+ }
- if (server->secmech.dec) {
- crypto_free_aead(server->secmech.dec);
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
+ server->secmech.dec = NULL;
+ }
+ } else {
+ server->secmech.enc = NULL;
server->secmech.dec = NULL;
}
}
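
The cifsencrypt.c rework changes the lifetime of the HMAC-MD5 transform: instead of a long-lived ses->server->secmech.hmacmd5 shared between users under the server lock, setup_ntlmv2_rsp() allocates a descriptor, threads it through the helpers, and frees it on exit. The per-call lifecycle with the cifs helpers as used above (key/data parameters are illustrative):

static int hmac_md5_once(const u8 *key, unsigned int keylen,
			 const u8 *data, unsigned int datalen, u8 *out)
{
	struct shash_desc *hmacmd5 = NULL;
	int rc;

	rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
	if (rc)
		return rc;

	rc = crypto_shash_setkey(hmacmd5->tfm, key, keylen);
	if (!rc)
		rc = crypto_shash_init(hmacmd5);
	if (!rc)
		rc = crypto_shash_update(hmacmd5, data, datalen);
	if (!rc)
		rc = crypto_shash_final(hmacmd5, out);

	cifs_free_hash(&hmacmd5);	/* frees and NULLs the pointer */
	return rc;
}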
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 15571cf0ba63..315aac5dec05 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -178,10 +178,8 @@ struct session_key {
/* crypto hashing related structure/fields, not specific to a sec mech */
struct cifs_secmech {
- struct shash_desc *hmacmd5; /* hmacmd5 hash function, for NTLMv2/CR1 hashes */
struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */
struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */
- struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */
struct shash_desc *aes_cmac; /* block-cipher based MAC function, for SMB3 signatures */
struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 3216f786908f..03c0b484a4b5 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -624,7 +624,7 @@ cifs_ses_add_channel(struct cifs_ses *ses,
* to sign packets before we generate the channel signing key
* (we sign with the session key)
*/
- rc = smb311_crypto_shash_allocate(chan->server);
+ rc = smb3_crypto_shash_allocate(chan->server);
if (rc) {
cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
mutex_unlock(&ses->session_mutex);
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index f3c4b70b77b9..bdeb12ff53e3 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -906,41 +906,41 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server,
|| (hdr->Status !=
cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))))
return 0;
-
ok:
- rc = smb311_crypto_shash_allocate(server);
- if (rc)
+ rc = cifs_alloc_hash("sha512", &sha512);
+ if (rc) {
+ cifs_dbg(VFS, "%s: Could not allocate SHA512 shash, rc=%d\n", __func__, rc);
return rc;
+ }
- sha512 = server->secmech.sha512;
rc = crypto_shash_init(sha512);
if (rc) {
- cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not init SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
rc = crypto_shash_update(sha512, ses->preauth_sha_hash,
SMB2_PREAUTH_HASH_SIZE);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
for (i = 0; i < nvec; i++) {
rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len);
if (rc) {
- cifs_dbg(VFS, "%s: Could not update sha512 shash\n",
- __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not update SHA512 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
}
rc = crypto_shash_final(sha512, ses->preauth_sha_hash);
if (rc) {
- cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n",
- __func__);
- return rc;
+ cifs_dbg(VFS, "%s: Could not finalize SHA12 shash, rc=%d\n", __func__, rc);
+ goto err_free;
}
+err_free:
+ cifs_free_hash(&sha512);
return 0;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 1ee2dd4a1cae..177173072bfa 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -4309,7 +4309,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
*/
static int
crypt_message(struct TCP_Server_Info *server, int num_rqst,
- struct smb_rqst *rqst, int enc)
+ struct smb_rqst *rqst, int enc, struct crypto_aead *tfm)
{
struct smb2_transform_hdr *tr_hdr =
(struct smb2_transform_hdr *)rqst[0].rq_iov[0].iov_base;
@@ -4320,8 +4320,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
u8 *iv;
- DECLARE_CRYPTO_WAIT(wait);
- struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
size_t sensitive_size;
@@ -4333,14 +4331,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- rc = smb3_crypto_aead_allocate(server);
- if (rc) {
- cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__);
- return rc;
- }
-
- tfm = enc ? server->secmech.enc : server->secmech.dec;
-
if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE);
@@ -4380,11 +4370,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
- aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done, &wait);
-
- rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
- : crypto_aead_decrypt(req), &wait);
+ rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
@@ -4526,7 +4512,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
/* fill the 1st iov with a transform header */
fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type);
- rc = crypt_message(server, num_rqst, new_rq, 1);
+ rc = crypt_message(server, num_rqst, new_rq, 1, server->secmech.enc);
cifs_dbg(FYI, "Encrypt message returned %d\n", rc);
if (rc)
goto err_free;
@@ -4551,8 +4537,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
unsigned int buf_data_size, struct iov_iter *iter,
bool is_offloaded)
{
- struct kvec iov[2];
+ struct crypto_aead *tfm;
struct smb_rqst rqst = {NULL};
+ struct kvec iov[2];
size_t iter_size = 0;
int rc;
@@ -4568,9 +4555,31 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
iter_size = iov_iter_count(iter);
}
- rc = crypt_message(server, 1, &rqst, 0);
+ if (is_offloaded) {
+ if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
+ (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
+ tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ else
+ tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+ if (IS_ERR(tfm)) {
+ rc = PTR_ERR(tfm);
+ cifs_server_dbg(VFS, "%s: Failed alloc decrypt TFM, rc=%d\n", __func__, rc);
+
+ return rc;
+ }
+ } else {
+ if (unlikely(!server->secmech.dec))
+ return -EIO;
+
+ tfm = server->secmech.dec;
+ }
+
+ rc = crypt_message(server, 1, &rqst, 0, tfm);
cifs_dbg(FYI, "Decrypt message returned %d\n", rc);
+ if (is_offloaded)
+ crypto_free_aead(tfm);
+
if (rc)
return rc;
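
For reference, the one-shot transform the offloaded path now builds follows the standard kernel AEAD sequence. A minimal sketch under stated assumptions (AES-128-GCM, a synchronous-only tfm, a buffer laid out as assoc data || ciphertext || tag; demo_gcm_decrypt() and its sizes are illustrative, not the cifs code):

	/* Illustrative one-shot AEAD decrypt; demo_gcm_decrypt() is hypothetical. */
	static int demo_gcm_decrypt(u8 *buf, unsigned int len, unsigned int assoclen,
				    u8 *key, u8 *iv)
	{
		struct crypto_aead *tfm;
		struct aead_request *req;
		struct scatterlist sg;
		int rc;

		/* CRYPTO_ALG_ASYNC in the mask requests a synchronous implementation */
		tfm = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		rc = crypto_aead_setkey(tfm, key, 16);
		if (!rc)
			rc = crypto_aead_setauthsize(tfm, 16);
		if (rc)
			goto out_free_tfm;

		req = aead_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			rc = -ENOMEM;
			goto out_free_tfm;
		}

		sg_init_one(&sg, buf, len);
		/* on the decrypt side, cryptlen covers ciphertext plus the tag */
		aead_request_set_crypt(req, &sg, &sg, len - assoclen, iv);
		aead_request_set_ad(req, assoclen);
		rc = crypto_aead_decrypt(req);

		aead_request_free(req);
	out_free_tfm:
		crypto_free_aead(tfm);
		return rc;
	}
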
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index bb225758448a..02828b9c3cb3 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -1266,6 +1266,16 @@ SMB2_negotiate(const unsigned int xid,
else
cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
}
+
+ if (server->cipher_type && !rc) {
+ if (!SERVER_IS_CHAN(server)) {
+ rc = smb3_crypto_aead_allocate(server);
+ } else {
+ /* For channels, just reuse the primary server crypto secmech. */
+ server->secmech.enc = server->primary_server->secmech.enc;
+ server->secmech.dec = server->primary_server->secmech.dec;
+ }
+ }
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index c7e1b149877a..56a896ff7cd9 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -291,7 +291,7 @@ extern int smb2_validate_and_copy_iov(unsigned int offset,
extern void smb2_copy_fs_info_to_kstatfs(
struct smb2_fs_full_size_info *pfs_inf,
struct kstatfs *kst);
-extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server);
+extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server);
extern int smb311_update_preauth_hash(struct cifs_ses *ses,
struct TCP_Server_Info *server,
struct kvec *iov, int nvec);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index e4636fca821d..f7e04c40d22e 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -26,8 +26,7 @@
#include "../common/smb2status.h"
#include "smb2glob.h"
-static int
-smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
+int smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
{
struct cifs_secmech *p = &server->secmech;
int rc;
@@ -46,33 +45,6 @@ err:
return rc;
}
-int
-smb311_crypto_shash_allocate(struct TCP_Server_Info *server)
-{
- struct cifs_secmech *p = &server->secmech;
- int rc = 0;
-
- rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256);
- if (rc)
- return rc;
-
- rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac);
- if (rc)
- goto err;
-
- rc = cifs_alloc_hash("sha512", &p->sha512);
- if (rc)
- goto err;
-
- return 0;
-
-err:
- cifs_free_hash(&p->aes_cmac);
- cifs_free_hash(&p->hmacsha256);
- return rc;
-}
-
-
static
int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
{
@@ -242,7 +214,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server,
ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId));
if (unlikely(!ses)) {
- cifs_server_dbg(VFS, "%s: Could not find session\n", __func__);
+ cifs_server_dbg(FYI, "%s: Could not find session\n", __func__);
return -ENOENT;
}
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c769f9dbc0b4..9f272cc8f566 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -6,7 +6,7 @@
* Note that, due to trying to use names similar to the protocol specifications,
* there are many mixed case field names in the structures below. Although
* this does not match typical Linux kernel style, it is necessary to be
- * able to match against the protocol specfication.
+ * able to match against the protocol specification.
*
* SMB2 commands
* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
@@ -491,7 +491,7 @@ struct smb2_encryption_neg_context {
__le16 ContextType; /* 2 */
__le16 DataLength;
__le32 Reserved;
- /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */
+ /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */
__le16 CipherCount; /* AES128-GCM and AES128-CCM by default */
__le16 Ciphers[];
} __packed;
@@ -1061,7 +1061,7 @@ struct smb2_server_client_notification {
#define IL_IMPERSONATION cpu_to_le32(0x00000002)
#define IL_DELEGATE cpu_to_le32(0x00000003)
-/* File Attrubutes */
+/* File Attributes */
#define FILE_ATTRIBUTE_READONLY 0x00000001
#define FILE_ATTRIBUTE_HIDDEN 0x00000002
#define FILE_ATTRIBUTE_SYSTEM 0x00000004
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index cac80e7bfefc..aa2a37a7ce84 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -25,7 +25,7 @@ DECLARE_RWSEM(conn_list_lock);
/**
* ksmbd_conn_free() - free resources of the connection instance
*
- * @conn: connection instance to be cleand up
+ * @conn: connection instance to be cleaned up
*
* During the thread termination, the corresponding conn instance
* resources(sock/memory) are released and finally the conn object is freed.
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index f4e55199938d..38e6fd2da3b8 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -213,7 +213,7 @@ struct ksmbd_tree_connect_response {
};
/*
- * IPC Request struture to disconnect tree connection.
+ * IPC Request structure to disconnect tree connection.
*/
struct ksmbd_tree_disconnect_request {
__u64 session_id; /* session id */
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 246cde380dfb..4142c7ad5fa9 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -796,7 +796,7 @@ out:
/**
* smb2_lease_break_noti() - break lease when a new client request
* write lease
- * @opinfo: conains lease state information
+ * @opinfo: contains lease state information
*
* Return: 0 on success, otherwise error
*/
@@ -1484,7 +1484,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
}
/**
- * parse_lease_state() - parse lease context containted in file open request
+ * parse_lease_state() - parse lease context contained in file open request
* @open_req: buffer containing smb2 file open(create) request
*
* Return: allocated lease context object on success, otherwise NULL
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index c402d4abe826..231d2d224656 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -279,7 +279,7 @@ static void handle_ksmbd_work(struct work_struct *wk)
/**
* queue_ksmbd_work() - queue a smb request to worker thread queue
- * for proccessing smb command and sending response
+ * for processing smb command and sending response
* @conn: connection instance
*
* read remaining data from socket create and submit work.
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index e6bdc1b20727..7460089c186f 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1335,8 +1335,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
return rc;
sz = le16_to_cpu(rsp->SecurityBufferOffset);
- chgblob =
- (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz);
+ chgblob = (struct challenge_message *)rsp->Buffer;
memset(chgblob, 0, sizeof(struct challenge_message));
if (!work->conn->use_spnego) {
@@ -1369,9 +1368,7 @@ static int ntlm_negotiate(struct ksmbd_work *work,
goto out;
}
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len,
- /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
out:
@@ -1453,10 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work,
if (rc)
return -ENOMEM;
- sz = le16_to_cpu(rsp->SecurityBufferOffset);
- unsafe_memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob,
- spnego_blob_len,
- /* alloc is larger than blob, see smb2_allocate_rsp_buf() */);
+ memcpy(rsp->Buffer, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
}
@@ -2058,18 +2052,20 @@ out_err1:
* @access: file access flags
* @disposition: file disposition flags
* @may_flags: set with MAY_ flags
- * @is_dir: is creating open flags for directory
+ * @coptions: file creation options
+ * @mode: file mode
*
* Return: file open flags
*/
static int smb2_create_open_flags(bool file_present, __le32 access,
__le32 disposition,
int *may_flags,
- bool is_dir)
+ __le32 coptions,
+ umode_t mode)
{
int oflags = O_NONBLOCK | O_LARGEFILE;
- if (is_dir) {
+ if (coptions & FILE_DIRECTORY_FILE_LE || S_ISDIR(mode)) {
access &= ~FILE_WRITE_DESIRE_ACCESS_LE;
ksmbd_debug(SMB, "Discard write access to a directory\n");
}
@@ -2086,7 +2082,7 @@ static int smb2_create_open_flags(bool file_present, __le32 access,
*may_flags = MAY_OPEN | MAY_READ;
}
- if (access == FILE_READ_ATTRIBUTES_LE)
+ if (access == FILE_READ_ATTRIBUTES_LE || S_ISBLK(mode) || S_ISCHR(mode))
oflags |= O_PATH;
if (file_present) {
@@ -3181,8 +3177,8 @@ int smb2_open(struct ksmbd_work *work)
open_flags = smb2_create_open_flags(file_present, daccess,
req->CreateDisposition,
&may_flags,
- req->CreateOptions & FILE_DIRECTORY_FILE_LE ||
- (file_present && S_ISDIR(d_inode(path.dentry)->i_mode)));
+ req->CreateOptions,
+ file_present ? d_inode(path.dentry)->i_mode : 0);
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
if (open_flags & (O_CREAT | O_TRUNC)) {
@@ -3531,8 +3527,9 @@ int smb2_open(struct ksmbd_work *work)
memcpy(fp->create_guid, dh_info.CreateGuid,
SMB2_CREATE_GUID_SIZE);
if (dh_info.timeout)
- fp->durable_timeout = min(dh_info.timeout,
- DURABLE_HANDLE_MAX_TIMEOUT);
+ fp->durable_timeout =
+ min_t(unsigned int, dh_info.timeout,
+ DURABLE_HANDLE_MAX_TIMEOUT);
else
fp->durable_timeout = 60;
}
@@ -4586,7 +4583,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
path = &fp->filp->f_path;
/* single EA entry is requested with given user.* name */
if (req->InputBufferLength) {
- if (le32_to_cpu(req->InputBufferLength) <
+ if (le32_to_cpu(req->InputBufferLength) <=
sizeof(struct smb2_ea_info_req))
return -EINVAL;
@@ -8090,7 +8087,7 @@ int smb2_ioctl(struct ksmbd_work *work)
goto out;
}
- if (in_buf_len < sizeof(struct copychunk_ioctl_req)) {
+ if (in_buf_len <= sizeof(struct copychunk_ioctl_req)) {
ret = -EINVAL;
goto out;
}
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 3be7d5ae65a8..73aff20e22d0 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -194,7 +194,7 @@ struct copychunk_ioctl_req {
__le64 ResumeKey[3];
__le32 ChunkCount;
__le32 Reserved;
- __u8 Chunks[1]; /* array of srv_copychunk */
+ __u8 Chunks[]; /* array of srv_copychunk */
} __packed;
struct srv_copychunk {
@@ -370,7 +370,7 @@ struct smb2_file_attr_tag_info {
struct smb2_ea_info_req {
__le32 NextEntryOffset;
__u8 EaNameLength;
- char name[1];
+ char name[];
} __packed; /* level 15 Query */
struct smb2_ea_info {
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index cc4bb2377cbd..5b8d75e78ffb 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -488,7 +488,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
* @shortname: destination short filename
*
* Return: shortname length or 0 when source long name is '.' or '..'
- * TODO: Though this function comforms the restriction of 8.3 Filename spec,
+ * TODO: Though this function conforms to the restriction of the 8.3 Filename spec,
* but the result is different with Windows 7's one. need to check.
*/
int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index b0f6d0f94cb8..5bbb179736c2 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -100,8 +100,8 @@ struct ksmbd_file {
struct list_head blocked_works;
struct list_head lock_list;
- int durable_timeout;
- int durable_scavenger_timeout;
+ unsigned int durable_timeout;
+ unsigned int durable_scavenger_timeout;
/* if ls is happening on directory, below is valid*/
struct ksmbd_readdir_data readdir_data;
diff --git a/fs/smb/server/xattr.h b/fs/smb/server/xattr.h
index fa3e27d6971b..505101a8104c 100644
--- a/fs/smb/server/xattr.h
+++ b/fs/smb/server/xattr.h
@@ -99,7 +99,7 @@ struct xattr_ntacl {
__u8 posix_acl_hash[XATTR_SD_HASH_SIZE]; /* 64bytes hash for posix acl */
};
-/* DOS ATTRIBUITE XATTR PREFIX */
+/* DOS ATTRIBUTE XATTR PREFIX */
#define DOS_ATTRIBUTE_PREFIX "DOSATTRIB"
#define DOS_ATTRIBUTE_PREFIX_LEN (sizeof(DOS_ATTRIBUTE_PREFIX) - 1)
#define XATTR_NAME_DOS_ATTRIBUTE (XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index f66f6aac74f6..d7941478158c 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -449,8 +449,6 @@ extern int ceph_osdc_init(struct ceph_osd_client *osdc,
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
-extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
- struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg);
void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h
index a3d3e888cf1f..038b2d523bf8 100644
--- a/include/linux/cleanup.h
+++ b/include/linux/cleanup.h
@@ -4,6 +4,142 @@
#include <linux/compiler.h>
+/**
+ * DOC: scope-based cleanup helpers
+ *
+ * The "goto error" pattern is notorious for introducing subtle resource
+ * leaks. It is tedious and error prone to add new resource acquisition
+ * constraints into code paths that already have several unwind
+ * conditions. The "cleanup" helpers enable the compiler to help with
+ * this tedium and can aid in maintaining LIFO (last in first out)
+ * unwind ordering to avoid unintentional leaks.
+ *
+ * As drivers make up the majority of the kernel code base, here is an
+ * example of using these helpers to clean up PCI drivers. The target of
+ * the cleanups are occasions where a goto is used to unwind a device
+ * reference (pci_dev_put()), or unlock the device (pci_dev_unlock())
+ * before returning.
+ *
+ * The DEFINE_FREE() macro can arrange for PCI device references to be
+ * dropped when the associated variable goes out of scope::
+ *
+ * DEFINE_FREE(pci_dev_put, struct pci_dev *, if (_T) pci_dev_put(_T))
+ * ...
+ * struct pci_dev *dev __free(pci_dev_put) =
+ * pci_get_slot(parent, PCI_DEVFN(0, 0));
+ *
+ * The above will automatically call pci_dev_put() if @dev is non-NULL
+ * when @dev goes out of scope (automatic variable scope). If a function
+ * wants to invoke pci_dev_put() on error, but return @dev (i.e. without
+ * freeing it) on success, it can do::
+ *
+ * return no_free_ptr(dev);
+ *
+ * ...or::
+ *
+ * return_ptr(dev);
+ *
+ * The DEFINE_GUARD() macro can arrange for the PCI device lock to be
+ * dropped when the scope where guard() is invoked ends::
+ *
+ * DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T))
+ * ...
+ * guard(pci_dev)(dev);
+ *
+ * The lifetime of the lock obtained by the guard() helper follows the
+ * scope of automatic variable declaration. Take the following example::
+ *
+ * func(...)
+ * {
+ * if (...) {
+ * ...
+ * guard(pci_dev)(dev); // pci_dev_lock() invoked here
+ * ...
+ * } // <- implied pci_dev_unlock() triggered here
+ * }
+ *
+ * Observe the lock is held for the remainder of the "if ()" block not
+ * the remainder of "func()".
+ *
+ * Now, when a function uses both __free() and guard(), or multiple
+ * instances of __free(), the LIFO order of variable definition
+ * matters. GCC documentation says:
+ *
+ * "When multiple variables in the same scope have cleanup attributes,
+ * at exit from the scope their associated cleanup functions are run in
+ * reverse order of definition (last defined, first cleanup)."
+ *
+ * When the unwind order matters it requires that variables be defined
+ * mid-function scope rather than at the top of the function. Take the
+ * following example and notice the bug highlighted by "!!"::
+ *
+ * LIST_HEAD(list);
+ * DEFINE_MUTEX(lock);
+ *
+ * struct object {
+ * struct list_head node;
+ * };
+ *
+ * static struct object *alloc_add(void)
+ * {
+ * struct object *obj;
+ *
+ * lockdep_assert_held(&lock);
+ * obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ * if (obj) {
+ * INIT_LIST_HEAD(&obj->node);
+ * list_add(&obj->node, &list);
+ * }
+ * return obj;
+ * }
+ *
+ * static void remove_free(struct object *obj)
+ * {
+ * lockdep_assert_held(&lock);
+ * list_del(&obj->node);
+ * kfree(obj);
+ * }
+ *
+ * DEFINE_FREE(remove_free, struct object *, if (_T) remove_free(_T))
+ * static int init(void)
+ * {
+ * struct object *obj __free(remove_free) = NULL;
+ * int err;
+ *
+ * guard(mutex)(&lock);
+ * obj = alloc_add();
+ *
+ * if (!obj)
+ * return -ENOMEM;
+ *
+ * err = other_init(obj);
+ * if (err)
+ * return err; // remove_free() called without the lock!!
+ *
+ * no_free_ptr(obj);
+ * return 0;
+ * }
+ *
+ * That bug is fixed by changing init() to call guard() and define +
+ * initialize @obj in this order::
+ *
+ * guard(mutex)(&lock);
+ * struct object *obj __free(remove_free) = alloc_add();
+ *
+ * Given that the "__free(...) = NULL" pattern for variables defined at
+ * the top of the function poses this potential interdependency problem,
+ * the recommendation is to always define and assign variables in one
+ * statement and not group variable definitions at the top of the
+ * function when __free() is used.
+ *
+ * Lastly, given that the benefit of cleanup helpers is removal of
+ * "goto", and that the "goto" statement can jump between scopes, the
+ * expectation is that usage of "goto" and cleanup helpers is never
+ * mixed in the same function. I.e. for a given routine, convert all
+ * resources that need a "goto" cleanup to scope-based cleanup, or
+ * convert none of them.
+ */
+
/*
* DEFINE_FREE(name, type, free):
* simple helper macro that defines the required wrapper for a __free()
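
Putting the rules in the DOC block above to work, a minimal sketch of the recommended shape (lock guard first, then the __free() variable defined and assigned in one statement; the demo_* names are illustrative):

	DEFINE_MUTEX(demo_lock);
	DEFINE_FREE(demo_kfree, void *, if (_T) kfree(_T))

	static int demo_init(void)
	{
		guard(mutex)(&demo_lock);		/* unlocked last, on any return */
		void *buf __free(demo_kfree) = kmalloc(64, GFP_KERNEL);

		if (!buf)
			return -ENOMEM;			/* no manual unwind needed */

		/* ... populate buf under demo_lock ... */
		return 0;				/* buf freed first, then unlock */
	}
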
diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h
index 955680c3bb5f..af871405ae55 100644
--- a/include/linux/folio_queue.h
+++ b/include/linux/folio_queue.h
@@ -3,6 +3,12 @@
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells ([email protected])
+ *
+ * See:
+ *
+ * Documentation/core-api/folio_queue.rst
+ *
+ * for a description of the API.
*/
#ifndef _LINUX_FOLIO_QUEUE_H
@@ -33,6 +39,13 @@ struct folio_queue {
#endif
};
+/**
+ * folioq_init - Initialise a folio queue segment
+ * @folioq: The segment to initialise
+ *
+ * Initialise a folio queue segment. Note that the folio pointers are
+ * left uninitialised.
+ */
static inline void folioq_init(struct folio_queue *folioq)
{
folio_batch_init(&folioq->vec);
@@ -43,62 +56,155 @@ static inline void folioq_init(struct folio_queue *folioq)
folioq->marks3 = 0;
}
+/**
+ * folioq_nr_slots: Query the capacity of a folio queue segment
+ * @folioq: The segment to query
+ *
+ * Query the number of folios that a particular folio queue segment might hold.
+ * [!] NOTE: This must not be assumed to be the same for every segment!
+ */
static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq)
{
return PAGEVEC_SIZE;
}
+/**
+ * folioq_count: Query the occupancy of a folio queue segment
+ * @folioq: The segment to query
+ *
+ * Query the number of folios that have been added to a folio queue segment.
+ * Note that this is not decreased as folios are removed from a segment.
+ */
static inline unsigned int folioq_count(struct folio_queue *folioq)
{
return folio_batch_count(&folioq->vec);
}
+/**
+ * folioq_full: Query if a folio queue segment is full
+ * @folioq: The segment to query
+ *
+ * Query if a folio queue segment is fully occupied. Note that this does not
+ * change if folios are removed from a segment.
+ */
static inline bool folioq_full(struct folio_queue *folioq)
{
//return !folio_batch_space(&folioq->vec);
return folioq_count(folioq) >= folioq_nr_slots(folioq);
}
+/**
+ * folioq_is_marked: Check first folio mark in a folio queue segment
+ * @folioq: The segment to query
+ * @slot: The slot number of the folio to query
+ *
+ * Determine if the first mark is set for the folio in the specified slot in a
+ * folio queue segment.
+ */
static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks);
}
+/**
+ * folioq_mark: Set the first mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Set the first mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks);
}
+/**
+ * folioq_unmark: Clear the first mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Clear the first mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks);
}
+/**
+ * folioq_is_marked2: Check second folio mark in a folio queue segment
+ * @folioq: The segment to query
+ * @slot: The slot number of the folio to query
+ *
+ * Determine if the second mark is set for the folio in the specified slot in a
+ * folio queue segment.
+ */
static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks2);
}
+/**
+ * folioq_mark2: Set the second mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Set the second mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks2);
}
+/**
+ * folioq_unmark2: Clear the second mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Clear the second mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks2);
}
+/**
+ * folioq_is_marked3: Check third folio mark in a folio queue segment
+ * @folioq: The segment to query
+ * @slot: The slot number of the folio to query
+ *
+ * Determine if the third mark is set for the folio in the specified slot in a
+ * folio queue segment.
+ */
static inline bool folioq_is_marked3(const struct folio_queue *folioq, unsigned int slot)
{
return test_bit(slot, &folioq->marks3);
}
+/**
+ * folioq_mark3: Set the third mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Set the third mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_mark3(struct folio_queue *folioq, unsigned int slot)
{
set_bit(slot, &folioq->marks3);
}
+/**
+ * folioq_unmark3: Clear the third mark on a folio in a folio queue segment
+ * @folioq: The segment to modify
+ * @slot: The slot number of the folio to modify
+ *
+ * Clear the third mark for the folio in the specified slot in a folio queue
+ * segment.
+ */
static inline void folioq_unmark3(struct folio_queue *folioq, unsigned int slot)
{
clear_bit(slot, &folioq->marks3);
@@ -111,6 +217,19 @@ static inline unsigned int __folio_order(struct folio *folio)
return folio->_flags_1 & 0xff;
}
+/**
+ * folioq_append: Add a folio to a folio queue segment
+ * @folioq: The segment to add to
+ * @folio: The folio to add
+ *
+ * Add a folio to the tail of the sequence in a folio queue segment, increasing
+ * the occupancy count and returning the slot number for the folio just added.
+ * The folio size is extracted and stored in the queue and the marks are left
+ * unmodified.
+ *
+ * Note that it's left up to the caller to check that the segment capacity will
+ * not be exceeded and to extend the queue.
+ */
static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio)
{
unsigned int slot = folioq->vec.nr++;
@@ -120,6 +239,19 @@ static inline unsigned int folioq_append(struct folio_queue *folioq, struct foli
return slot;
}
+/**
+ * folioq_append_mark: Add a folio to a folio queue segment
+ * @folioq: The segment to add to
+ * @folio: The folio to add
+ *
+ * Add a folio to the tail of the sequence in a folio queue segment, increasing
+ * the occupancy count and returning the slot number for the folio just added.
+ * The folio size is extracted and stored in the queue, the first mark is set
+ * and the second and third marks are left unmodified.
+ *
+ * Note that it's left up to the caller to check that the segment capacity will
+ * not be exceeded and to extend the queue.
+ */
static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio)
{
unsigned int slot = folioq->vec.nr++;
@@ -130,21 +262,57 @@ static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct
return slot;
}
+/**
+ * folioq_folio: Get a folio from a folio queue segment
+ * @folioq: The segment to access
+ * @slot: The folio slot to access
+ *
+ * Retrieve the folio in the specified slot from a folio queue segment. Note
+ * that no bounds check is made and if the slot hasn't been added into yet, the
+ * pointer will be undefined. If the slot has been cleared, NULL will be
+ * returned.
+ */
static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot)
{
return folioq->vec.folios[slot];
}
+/**
+ * folioq_folio_order: Get the order of a folio from a folio queue segment
+ * @folioq: The segment to access
+ * @slot: The folio slot to access
+ *
+ * Retrieve the order of the folio in the specified slot from a folio queue
+ * segment. Note that no bounds check is made and if the slot hasn't been
+ * added into yet, the order returned will be 0.
+ */
static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot)
{
return folioq->orders[slot];
}
+/**
+ * folioq_folio_size: Get the size of a folio from a folio queue segment
+ * @folioq: The segment to access
+ * @slot: The folio slot to access
+ *
+ * Retrieve the size of the folio in the specified slot from a folio queue
+ * segment. Note that no bounds check is made and if the slot hasn't been
+ * added into yet, the size returned will be PAGE_SIZE.
+ */
static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot)
{
return PAGE_SIZE << folioq_folio_order(folioq, slot);
}
+/**
+ * folioq_clear: Clear a folio from a folio queue segment
+ * @folioq: The segment to clear
+ * @slot: The folio slot to clear
+ *
+ * Clear a folio from a sequence in a folio queue segment and clear its marks.
+ * The occupancy count is left unchanged.
+ */
static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot)
{
folioq->vec.folios[slot] = NULL;
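
Because folioq_append() and folioq_append_mark() leave capacity checks to the caller, adding to a queue typically pairs folioq_full() with allocating a fresh segment. A sketch under stated assumptions (the ->next/->prev linkage is assumed from the full struct, which this hunk does not show; demo_add_folio() is illustrative):

	/* Illustrative append that grows the queue; demo_add_folio() is hypothetical. */
	static struct folio_queue *demo_add_folio(struct folio_queue *tail,
						  struct folio *folio)
	{
		if (folioq_full(tail)) {
			struct folio_queue *next;

			next = kmalloc(sizeof(*next), GFP_KERNEL);
			if (!next)
				return NULL;
			folioq_init(next);
			next->prev = tail;	/* assumed linkage fields */
			tail->next = next;
			tail = next;
		}

		folioq_append_mark(tail, folio);	/* also sets the first mark */
		return tail;
	}
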
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 0d5125a3e31a..db567d26f7b9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1529,8 +1529,22 @@ static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
#endif
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
-int kvm_arch_hardware_enable(void);
-void kvm_arch_hardware_disable(void);
+/*
+ * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under
+ * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of
+ * kvm_usage_count, i.e. at the beginning of the generic hardware enabling
+ * sequence, and at the end of the generic hardware disabling sequence.
+ */
+void kvm_arch_enable_virtualization(void);
+void kvm_arch_disable_virtualization(void);
+/*
+ * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to
+ * do the actual twiddling of hardware bits. The hooks are called on all
+ * online CPUs when KVM enables/disables virtualization, and on a single CPU
+ * when that CPU is onlined/offlined (including for Resume/Suspend).
+ */
+int kvm_arch_enable_virtualization_cpu(void);
+void kvm_arch_disable_virtualization_cpu(void);
#endif
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
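
An arch backend would split its old hardware enable/disable logic across the two pairs roughly as below (hypothetical stubs, not taken from any in-tree port):

	/* Hypothetical arch wiring for the renamed hooks. */
	void kvm_arch_enable_virtualization(void)
	{
		/* one-time global setup on the 0=>1 kvm_usage_count transition */
	}

	int kvm_arch_enable_virtualization_cpu(void)
	{
		/* per-CPU hardware bit twiddling; may fail on this CPU */
		return 0;
	}

	void kvm_arch_disable_virtualization_cpu(void)
	{
		/* per-CPU undo, also used on offline/suspend */
	}

	void kvm_arch_disable_virtualization(void)
	{
		/* one-time global teardown on the 1=>0 transition */
	}
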
diff --git a/include/trace/events/dma.h b/include/trace/events/dma.h
index f57f05331d73..569f86a44aaa 100644
--- a/include/trace/events/dma.h
+++ b/include/trace/events/dma.h
@@ -176,9 +176,9 @@ TRACE_EVENT(dma_free,
);
TRACE_EVENT(dma_map_sg,
- TP_PROTO(struct device *dev, struct scatterlist *sg, int nents,
+ TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents,
int ents, enum dma_data_direction dir, unsigned long attrs),
- TP_ARGS(dev, sg, nents, ents, dir, attrs),
+ TP_ARGS(dev, sgl, nents, ents, dir, attrs),
TP_STRUCT__entry(
__string(device, dev_name(dev))
@@ -190,17 +190,17 @@ TRACE_EVENT(dma_map_sg,
),
TP_fast_assign(
+ struct scatterlist *sg;
int i;
__assign_str(device);
- for (i = 0; i < nents; i++)
- ((u64 *)__get_dynamic_array(phys_addrs))[i] =
- sg_phys(sg + i);
- for (i = 0; i < ents; i++) {
+ for_each_sg(sgl, sg, nents, i)
+ ((u64 *)__get_dynamic_array(phys_addrs))[i] = sg_phys(sg);
+ for_each_sg(sgl, sg, ents, i) {
((u64 *)__get_dynamic_array(dma_addrs))[i] =
- sg_dma_address(sg + i);
+ sg_dma_address(sg);
((unsigned int *)__get_dynamic_array(lengths))[i] =
- sg_dma_len(sg + i);
+ sg_dma_len(sg);
}
__entry->dir = dir;
__entry->attrs = attrs;
@@ -222,9 +222,9 @@ TRACE_EVENT(dma_map_sg,
);
TRACE_EVENT(dma_unmap_sg,
- TP_PROTO(struct device *dev, struct scatterlist *sg, int nents,
+ TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs),
- TP_ARGS(dev, sg, nents, dir, attrs),
+ TP_ARGS(dev, sgl, nents, dir, attrs),
TP_STRUCT__entry(
__string(device, dev_name(dev))
@@ -234,12 +234,12 @@ TRACE_EVENT(dma_unmap_sg,
),
TP_fast_assign(
+ struct scatterlist *sg;
int i;
__assign_str(device);
- for (i = 0; i < nents; i++)
- ((u64 *)__get_dynamic_array(addrs))[i] =
- sg_phys(sg + i);
+ for_each_sg(sgl, sg, nents, i)
+ ((u64 *)__get_dynamic_array(addrs))[i] = sg_phys(sg);
__entry->dir = dir;
__entry->attrs = attrs;
),
@@ -290,9 +290,9 @@ DEFINE_EVENT(dma_sync_single, dma_sync_single_for_device,
TP_ARGS(dev, dma_addr, size, dir));
DECLARE_EVENT_CLASS(dma_sync_sg,
- TP_PROTO(struct device *dev, struct scatterlist *sg, int nents,
+ TP_PROTO(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir),
- TP_ARGS(dev, sg, nents, dir),
+ TP_ARGS(dev, sgl, nents, dir),
TP_STRUCT__entry(
__string(device, dev_name(dev))
@@ -302,14 +302,15 @@ DECLARE_EVENT_CLASS(dma_sync_sg,
),
TP_fast_assign(
+ struct scatterlist *sg;
int i;
__assign_str(device);
- for (i = 0; i < nents; i++) {
+ for_each_sg(sgl, sg, nents, i) {
((u64 *)__get_dynamic_array(dma_addrs))[i] =
- sg_dma_address(sg + i);
+ sg_dma_address(sg);
((unsigned int *)__get_dynamic_array(lengths))[i] =
- sg_dma_len(sg + i);
+ sg_dma_len(sg);
}
__entry->dir = dir;
),
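
The sg + i arithmetic removed above is only valid for a flat array; a scatterlist may be chained, and chained entries are reached through for_each_sg(), which follows the chain links. A minimal sketch (assuming an already DMA-mapped list; demo_sum_dma_len() is illustrative):

	/* Walks chained scatterlists correctly where sg + i would not. */
	static u64 demo_sum_dma_len(struct scatterlist *sgl, int ents)
	{
		struct scatterlist *sg;
		u64 total = 0;
		int i;

		for_each_sg(sgl, sg, ents, i)
			total += sg_dma_len(sg);

		return total;
	}
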
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 76bd42a96815..1d7c52821e55 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -448,7 +448,8 @@ TRACE_EVENT(netfs_folio,
),
TP_fast_assign(
- __entry->ino = folio->mapping->host->i_ino;
+ struct address_space *__m = READ_ONCE(folio->mapping);
+ __entry->ino = __m ? __m->host->i_ino : 0;
__entry->why = why;
__entry->index = folio_index(folio);
__entry->nr = folio_nr_pages(folio);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 6dc76b590703..93a822d3c468 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -168,7 +168,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key)
jump_label_update(key);
/*
* Ensure that when static_key_fast_inc_not_disabled() or
- * static_key_slow_try_dec() observe the positive value,
+ * static_key_dec_not_one() observe the positive value,
* they must also observe all the text changes.
*/
atomic_set_release(&key->enabled, 1);
@@ -250,7 +250,7 @@ void static_key_disable(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_disable);
-static bool static_key_slow_try_dec(struct static_key *key)
+static bool static_key_dec_not_one(struct static_key *key)
{
int v;
@@ -274,6 +274,14 @@ static bool static_key_slow_try_dec(struct static_key *key)
* enabled. This suggests an ordering problem on the user side.
*/
WARN_ON_ONCE(v < 0);
+
+ /*
+ * Warn about underflow, and lie about success in an attempt to
+ * not make things worse.
+ */
+ if (WARN_ON_ONCE(v == 0))
+ return true;
+
if (v <= 1)
return false;
} while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));
@@ -284,15 +292,27 @@ static bool static_key_slow_try_dec(struct static_key *key)
static void __static_key_slow_dec_cpuslocked(struct static_key *key)
{
lockdep_assert_cpus_held();
+ int val;
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
return;
guard(mutex)(&jump_label_mutex);
- if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
+ val = atomic_read(&key->enabled);
+ /*
+ * It should be impossible to observe -1 with jump_label_mutex held,
+ * see static_key_slow_inc_cpuslocked().
+ */
+ if (WARN_ON_ONCE(val == -1))
+ return;
+ /*
+ * Cannot already be 0, something went sideways.
+ */
+ if (WARN_ON_ONCE(val == 0))
+ return;
+
+ if (atomic_dec_and_test(&key->enabled))
jump_label_update(key);
- else
- WARN_ON_ONCE(!static_key_slow_try_dec(key));
}
static void __static_key_slow_dec(struct static_key *key)
@@ -329,7 +349,7 @@ void __static_key_slow_dec_deferred(struct static_key *key,
{
STATIC_KEY_CHECK_USE(key);
- if (static_key_slow_try_dec(key))
+ if (static_key_dec_not_one(key))
return;
schedule_delayed_work(work, timeout);
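
For context, the reference counting being hardened here backs the static_branch API, where only the 0<=>1 transitions patch code. A minimal usage sketch (the demo_* names are illustrative):

	static DEFINE_STATIC_KEY_FALSE(demo_key);

	static void demo_get(void)
	{
		static_branch_inc(&demo_key);	/* 0=>1 patches text; beyond that it only counts */
	}

	static void demo_put(void)
	{
		static_branch_dec(&demo_key);	/* 1=>0 patches text back */
	}

	static bool demo_fast_path(void)
	{
		return static_branch_unlikely(&demo_key);
	}
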
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7963deac33c3..536bd471557f 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -788,7 +788,7 @@ static void lockdep_print_held_locks(struct task_struct *p)
printk("no locks held by %s/%d.\n", p->comm, task_pid_nr(p));
else
printk("%d lock%s held by %s/%d:\n", depth,
- depth > 1 ? "s" : "", p->comm, task_pid_nr(p));
+ str_plural(depth), p->comm, task_pid_nr(p));
/*
* It's not reliable to print a task's held locks if it's not sleeping
* and it's not the current task.
@@ -2084,6 +2084,9 @@ static noinline void print_bfs_bug(int ret)
/*
* Breadth-first-search failed, graph got corrupted?
*/
+ if (ret == BFS_EQUEUEFULL)
+ pr_warn("Increase LOCKDEP_CIRCULAR_QUEUE_BITS to avoid this warning:\n");
+
WARN(1, "lockdep bfs error:%d\n", ret);
}
@@ -6263,25 +6266,27 @@ static struct pending_free *get_pending_free(void)
static void free_zapped_rcu(struct rcu_head *cb);
/*
- * Schedule an RCU callback if no RCU callback is pending. Must be called with
- * the graph lock held.
- */
-static void call_rcu_zapped(struct pending_free *pf)
+ * See if we need to queue an RCU callback; must be called with
+ * the lockdep lock held. Returns false if we either don't have
+ * any pending free or the callback is already scheduled.
+ * Otherwise, a call_rcu() must follow this function call.
+ */
+static bool prepare_call_rcu_zapped(struct pending_free *pf)
{
WARN_ON_ONCE(inside_selftest());
if (list_empty(&pf->zapped))
- return;
+ return false;
if (delayed_free.scheduled)
- return;
+ return false;
delayed_free.scheduled = true;
WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf);
delayed_free.index ^= 1;
- call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+ return true;
}
/* The caller must hold the graph lock. May be called from RCU context. */
@@ -6307,6 +6312,7 @@ static void free_zapped_rcu(struct rcu_head *ch)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
if (WARN_ON_ONCE(ch != &delayed_free.rcu_head))
return;
@@ -6318,14 +6324,18 @@ static void free_zapped_rcu(struct rcu_head *ch)
pf = delayed_free.pf + (delayed_free.index ^ 1);
__free_zapped_classes(pf);
delayed_free.scheduled = false;
+ need_callback =
+ prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ lockdep_unlock();
+ raw_local_irq_restore(flags);
/*
- * If there's anything on the open list, close and start a new callback.
- */
- call_rcu_zapped(delayed_free.pf + delayed_free.index);
+ * If there's a pending free and its callback has not been scheduled,
+ * queue an RCU callback.
+ */
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
- lockdep_unlock();
- raw_local_irq_restore(flags);
}
/*
@@ -6365,6 +6375,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
{
struct pending_free *pf;
unsigned long flags;
+ bool need_callback;
init_data_structures_once();
@@ -6372,10 +6383,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size)
lockdep_lock();
pf = get_pending_free();
__lockdep_free_key_range(pf, start, size);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
lockdep_unlock();
raw_local_irq_restore(flags);
-
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
/*
* Wait for any possible iterators from look_up_lock_class() to pass
* before continuing to free the memory they refer to.
@@ -6469,6 +6481,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
struct pending_free *pf;
unsigned long flags;
int locked;
+ bool need_callback = false;
raw_local_irq_save(flags);
locked = graph_lock();
@@ -6477,11 +6490,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock)
pf = get_pending_free();
__lockdep_reset_lock(pf, lock);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
graph_unlock();
out_irq:
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
}
/*
@@ -6525,6 +6540,7 @@ void lockdep_unregister_key(struct lock_class_key *key)
struct pending_free *pf;
unsigned long flags;
bool found = false;
+ bool need_callback = false;
might_sleep();
@@ -6545,11 +6561,14 @@ void lockdep_unregister_key(struct lock_class_key *key)
if (found) {
pf = get_pending_free();
__lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
+ need_callback = prepare_call_rcu_zapped(pf);
}
lockdep_unlock();
raw_local_irq_restore(flags);
+ if (need_callback)
+ call_rcu(&delayed_free.rcu_head, free_zapped_rcu);
+
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
synchronize_rcu();
}
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index e2bfb1db589d..6db0f43fc4df 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -424,7 +424,7 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
for (i = 0; i < offset; i++)
seq_puts(m, " ");
for (i = 0; i < length; i++)
- seq_printf(m, "%c", c);
+ seq_putc(m, c);
seq_puts(m, "\n");
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 5ded7dff46ef..2bbb6eca5144 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -181,12 +181,21 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
__rwsem_set_reader_owned(sem, current);
}
+#ifdef CONFIG_DEBUG_RWSEMS
+/*
+ * Return just the real task structure pointer of the owner
+ */
+static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
/*
* Return true if the rwsem is owned by a reader.
*/
static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
{
-#ifdef CONFIG_DEBUG_RWSEMS
/*
* Check the count to see if it is write-locked.
*/
@@ -194,11 +203,9 @@ static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
if (count & RWSEM_WRITER_MASK)
return false;
-#endif
return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
}
-#ifdef CONFIG_DEBUG_RWSEMS
/*
* With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
* is a task pointer in owner of a reader-owned rwsem, it will be the
@@ -266,15 +273,6 @@ static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
}
/*
- * Return just the real task structure pointer of the owner
- */
-static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
-{
- return (struct task_struct *)
- (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
-}
-
-/*
* Return the real task structure pointer of the owner and the embedded
* flags in the owner. pflags must be non-NULL.
*/
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 05a9a06a140c..7c6588148d42 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -229,7 +229,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL
choice
- prompt "Which hash algorithm should modules be signed with?"
+ prompt "Hash algorithm to sign modules"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
help
This determines which sort of hashing algorithm will be used during
@@ -239,31 +239,31 @@ choice
the signature on that module.
config MODULE_SIG_SHA1
- bool "Sign modules with SHA-1"
+ bool "SHA-1"
select CRYPTO_SHA1
config MODULE_SIG_SHA256
- bool "Sign modules with SHA-256"
+ bool "SHA-256"
select CRYPTO_SHA256
config MODULE_SIG_SHA384
- bool "Sign modules with SHA-384"
+ bool "SHA-384"
select CRYPTO_SHA512
config MODULE_SIG_SHA512
- bool "Sign modules with SHA-512"
+ bool "SHA-512"
select CRYPTO_SHA512
config MODULE_SIG_SHA3_256
- bool "Sign modules with SHA3-256"
+ bool "SHA3-256"
select CRYPTO_SHA3
config MODULE_SIG_SHA3_384
- bool "Sign modules with SHA3-384"
+ bool "SHA3-384"
select CRYPTO_SHA3
config MODULE_SIG_SHA3_512
- bool "Sign modules with SHA3-512"
+ bool "SHA3-512"
select CRYPTO_SHA3
endchoice
@@ -279,64 +279,65 @@ config MODULE_SIG_HASH
default "sha3-384" if MODULE_SIG_SHA3_384
default "sha3-512" if MODULE_SIG_SHA3_512
-choice
- prompt "Module compression mode"
+config MODULE_COMPRESS
+ bool "Module compression"
help
- This option allows you to choose the algorithm which will be used to
- compress modules when 'make modules_install' is run. (or, you can
- choose to not compress modules at all.)
-
- External modules will also be compressed in the same way during the
- installation.
-
- For modules inside an initrd or initramfs, it's more efficient to
- compress the whole initrd or initramfs instead.
-
+ Enable module compression to reduce on-disk size of module binaries.
This is fully compatible with signed modules.
- Please note that the tool used to load modules needs to support the
- corresponding algorithm. module-init-tools MAY support gzip, and kmod
- MAY support gzip, xz and zstd.
+ The tool used to work with modules needs to support the selected
+ compression type. kmod MAY support gzip, xz and zstd. Other tools
+ might have a limited selection of the supported types.
- Your build system needs to provide the appropriate compression tool
- to compress the modules.
+ Note that for modules inside an initrd or initramfs, it's more
+ efficient to compress the whole ramdisk instead.
- If in doubt, select 'None'.
+ If unsure, say N.
-config MODULE_COMPRESS_NONE
- bool "None"
+choice
+ prompt "Module compression type"
+ depends on MODULE_COMPRESS
help
- Do not compress modules. The installed modules are suffixed
- with .ko.
+ Choose the supported algorithm for module compression.
config MODULE_COMPRESS_GZIP
bool "GZIP"
help
- Compress modules with GZIP. The installed modules are suffixed
- with .ko.gz.
+ Support modules compressed with GZIP. The installed modules are
+ suffixed with .ko.gz.
config MODULE_COMPRESS_XZ
bool "XZ"
help
- Compress modules with XZ. The installed modules are suffixed
- with .ko.xz.
+ Support modules compressed with XZ. The installed modules are
+ suffixed with .ko.xz.
config MODULE_COMPRESS_ZSTD
bool "ZSTD"
help
- Compress modules with ZSTD. The installed modules are suffixed
- with .ko.zst.
+ Support modules compressed with ZSTD. The installed modules are
+ suffixed with .ko.zst.
endchoice
+config MODULE_COMPRESS_ALL
+ bool "Automatically compress all modules"
+ default y
+ depends on MODULE_COMPRESS
+ help
+ Compress all modules during 'make modules_install'.
+
+ Your build system needs to provide the appropriate compression tool
+ for the selected compression type. External modules will also be
+ compressed in the same way during the installation.
+
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
+ depends on MODULE_COMPRESS
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
help
-
Support for decompressing kernel modules by the kernel itself
instead of relying on userspace to perform this task. Useful when
load pinning security policy is enabled.
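
With the options restructured this way, a configuration that compresses every module with ZSTD and decompresses in the kernel would look something like this fragment (illustrative .config excerpt):

	CONFIG_MODULE_COMPRESS=y
	CONFIG_MODULE_COMPRESS_ZSTD=y
	CONFIG_MODULE_COMPRESS_ALL=y
	CONFIG_MODULE_DECOMPRESS=y
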
diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c
index 12a569d361e8..b4cc03842d70 100644
--- a/kernel/module/debug_kmemleak.c
+++ b/kernel/module/debug_kmemleak.c
@@ -12,19 +12,9 @@
void kmemleak_load_module(const struct module *mod,
const struct load_info *info)
{
- unsigned int i;
-
- /* only scan the sections containing data */
- kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
-
- for (i = 1; i < info->hdr->e_shnum; i++) {
- /* Scan all writable sections that's not executable */
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
- !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
- (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
- continue;
-
- kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
- info->sechdrs[i].sh_size, GFP_KERNEL);
+ /* only scan writable, non-executable sections */
+ for_each_mod_mem_type(type) {
+ if (type != MOD_DATA && type != MOD_INIT_DATA)
+ kmemleak_no_scan(mod->mem[type].base);
}
}
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index 26efe1305c12..456358e1fdc4 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -69,12 +69,13 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
kfree(sect_attrs);
}
-static void add_sect_attrs(struct module *mod, const struct load_info *info)
+static int add_sect_attrs(struct module *mod, const struct load_info *info)
{
unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
struct module_sect_attr *sattr;
struct bin_attribute **gattr;
+ int ret;
/* Count loaded sections and allocate structures */
for (i = 0; i < info->hdr->e_shnum; i++)
@@ -85,7 +86,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
if (!sect_attrs)
- return;
+ return -ENOMEM;
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
@@ -103,8 +104,10 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->address = sec->sh_addr;
sattr->battr.attr.name =
kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (!sattr->battr.attr.name)
+ if (!sattr->battr.attr.name) {
+ ret = -ENOMEM;
goto out;
+ }
sect_attrs->nsections++;
sattr->battr.read = module_sect_read;
sattr->battr.size = MODULE_SECT_READ_SIZE;
@@ -113,13 +116,15 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
}
*gattr = NULL;
- if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+ ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp);
+ if (ret)
goto out;
mod->sect_attrs = sect_attrs;
- return;
+ return 0;
out:
free_sect_attrs(sect_attrs);
+ return ret;
}
static void remove_sect_attrs(struct module *mod)
@@ -158,15 +163,12 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
kfree(notes_attrs);
}
-static void add_notes_attrs(struct module *mod, const struct load_info *info)
+static int add_notes_attrs(struct module *mod, const struct load_info *info)
{
unsigned int notes, loaded, i;
struct module_notes_attrs *notes_attrs;
struct bin_attribute *nattr;
-
- /* failed to create section attributes, so can't create notes */
- if (!mod->sect_attrs)
- return;
+ int ret;
/* Count notes sections and allocate structures. */
notes = 0;
@@ -176,12 +178,12 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
++notes;
if (notes == 0)
- return;
+ return 0;
notes_attrs = kzalloc(struct_size(notes_attrs, attrs, notes),
GFP_KERNEL);
if (!notes_attrs)
- return;
+ return -ENOMEM;
notes_attrs->notes = notes;
nattr = &notes_attrs->attrs[0];
@@ -201,19 +203,23 @@ static void add_notes_attrs(struct module *mod, const struct load_info *info)
}
notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir)
+ if (!notes_attrs->dir) {
+ ret = -ENOMEM;
goto out;
+ }
- for (i = 0; i < notes; ++i)
- if (sysfs_create_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]))
+ for (i = 0; i < notes; ++i) {
+ ret = sysfs_create_bin_file(notes_attrs->dir, &notes_attrs->attrs[i]);
+ if (ret)
goto out;
+ }
mod->notes_attrs = notes_attrs;
- return;
+ return 0;
out:
free_notes_attrs(notes_attrs, i);
+ return ret;
}
static void remove_notes_attrs(struct module *mod)
@@ -223,9 +229,15 @@ static void remove_notes_attrs(struct module *mod)
}
#else /* !CONFIG_KALLSYMS */
-static inline void add_sect_attrs(struct module *mod, const struct load_info *info) { }
+static inline int add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
static inline void remove_sect_attrs(struct module *mod) { }
-static inline void add_notes_attrs(struct module *mod, const struct load_info *info) { }
+static inline int add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+ return 0;
+}
static inline void remove_notes_attrs(struct module *mod) { }
#endif /* CONFIG_KALLSYMS */
@@ -385,11 +397,20 @@ int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_modinfo_attrs;
- add_sect_attrs(mod, info);
- add_notes_attrs(mod, info);
+ err = add_sect_attrs(mod, info);
+ if (err)
+ goto out_del_usage_links;
+
+ err = add_notes_attrs(mod, info);
+ if (err)
+ goto out_unreg_sect_attrs;
return 0;
+out_unreg_sect_attrs:
+ remove_sect_attrs(mod);
+out_del_usage_links:
+ del_usage_links(mod);
out_unreg_modinfo_attrs:
module_remove_modinfo_attrs(mod, -1);
out_unreg_param:
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index 639397b5491c..5259cda486d0 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -411,6 +411,17 @@ static void static_call_del_module(struct module *mod)
for (site = start; site < stop; site++) {
key = static_call_key(site);
+
+ /*
+ * If the key was not updated due to a memory allocation
+ * failure in __static_call_init() then treating key::sites
+ * as key::mods in the code below would cause random memory
+ * access and #GP. In that case all subsequent sites have
+ * not been touched either, so stop iterating.
+ */
+ if (!static_call_key_has_mods(key))
+ break;
+
if (key == prev_key)
continue;
@@ -442,7 +453,7 @@ static int static_call_module_notify(struct notifier_block *nb,
case MODULE_STATE_COMING:
ret = static_call_add_module(mod);
if (ret) {
- WARN(1, "Failed to allocate memory for static calls");
+ pr_warn("Failed to allocate memory for static calls\n");
static_call_del_module(mod);
}
break;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 3c8b78d9c4d1..d1b5705dc0c6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1254,7 +1254,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
colon_p = memchr(name, ':', namelen);
if (delim_p && colon_p)
- end = delim_p < colon_p ? delim_p : colon_p;
+ end = min(delim_p, colon_p);
else if (!delim_p && colon_p)
end = colon_p;
else {
diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst
index 4d81ed9af294..d97720943189 100644
--- a/scripts/Makefile.modinst
+++ b/scripts/Makefile.modinst
@@ -53,9 +53,11 @@ $(foreach x, % :, $(if $(findstring $x, $(dst)), \
$(error module installation path cannot contain '$x')))
suffix-y :=
+ifdef CONFIG_MODULE_COMPRESS_ALL
suffix-$(CONFIG_MODULE_COMPRESS_GZIP) := .gz
suffix-$(CONFIG_MODULE_COMPRESS_XZ) := .xz
suffix-$(CONFIG_MODULE_COMPRESS_ZSTD) := .zst
+endif
modules := $(patsubst $(extmod_prefix)%.o, $(dst)/%.ko$(suffix-y), $(modules))
install-$(CONFIG_MODULES) += $(modules)
diff --git a/scripts/coccinelle/api/string_choices.cocci b/scripts/coccinelle/api/string_choices.cocci
index 5e729f187f22..375045086912 100644
--- a/scripts/coccinelle/api/string_choices.cocci
+++ b/scripts/coccinelle/api/string_choices.cocci
@@ -14,23 +14,18 @@ expression E;
- ((E == 1) ? "" : "s")
+ str_plural(E)
|
-- ((E != 1) ? "s" : "")
-+ str_plural(E)
-|
- ((E > 1) ? "s" : "")
+ str_plural(E)
)
-@str_plural_r depends on !patch exists@
+@str_plural_r depends on !patch@
expression E;
position P;
@@
(
-* ((E@P == 1) ? "" : "s")
-|
-* ((E@P != 1) ? "s" : "")
+* (E@P == 1) ? "" : "s"
|
-* ((E@P > 1) ? "s" : "")
+* (E@P > 1) ? "s" : ""
)
@script:python depends on report@
@@ -40,21 +35,17 @@ e << str_plural_r.E;
coccilib.report.print_report(p[0], "opportunity for str_plural(%s)" % e)
-@str_up_down depends on patch@
+@str_up_down depends on patch disable neg_if_exp@
expression E;
@@
-(
- ((E) ? "up" : "down")
+ str_up_down(E)
-)
-@str_up_down_r depends on !patch exists@
+@str_up_down_r depends on !patch disable neg_if_exp@
expression E;
position P;
@@
-(
-* ((E@P) ? "up" : "down")
-)
+* E@P ? "up" : "down"
@script:python depends on report@
p << str_up_down_r.P;
@@ -63,21 +54,17 @@ e << str_up_down_r.E;
coccilib.report.print_report(p[0], "opportunity for str_up_down(%s)" % e)
-@str_down_up depends on patch@
+@str_down_up depends on patch disable neg_if_exp@
expression E;
@@
-(
- ((E) ? "down" : "up")
+ str_down_up(E)
-)
-@str_down_up_r depends on !patch exists@
+@str_down_up_r depends on !patch disable neg_if_exp@
expression E;
position P;
@@
-(
-* ((E@P) ? "down" : "up")
-)
+* E@P ? "down" : "up"
@script:python depends on report@
p << str_down_up_r.P;
@@ -85,3 +72,231 @@ e << str_down_up_r.E;
@@
coccilib.report.print_report(p[0], "opportunity for str_down_up(%s)" % e)
+
+@str_true_false depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "true" : "false")
++ str_true_false(E)
+
+@str_true_false_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "true" : "false"
+
+@script:python depends on report@
+p << str_true_false_r.P;
+e << str_true_false_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_true_false(%s)" % e)
+
+@str_false_true depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "false" : "true")
++ str_false_true(E)
+
+@str_false_true_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "false" : "true"
+
+@script:python depends on report@
+p << str_false_true_r.P;
+e << str_false_true_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_false_true(%s)" % e)
+
+@str_hi_lo depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "hi" : "lo")
++ str_hi_lo(E)
+
+@str_hi_lo_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "hi" : "lo"
+
+@script:python depends on report@
+p << str_hi_lo_r.P;
+e << str_hi_lo_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_hi_lo(%s)" % e)
+
+@str_high_low depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "high" : "low")
++ str_high_low(E)
+
+@str_high_low_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "high" : "low"
+
+@script:python depends on report@
+p << str_high_low_r.P;
+e << str_high_low_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_high_low(%s)" % e)
+
+@str_lo_hi depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "lo" : "hi")
++ str_lo_hi(E)
+
+@str_lo_hi_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "lo" : "hi"
+
+@script:python depends on report@
+p << str_lo_hi_r.P;
+e << str_lo_hi_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_lo_hi(%s)" % e)
+
+@str_low_high depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "low" : "high")
++ str_low_high(E)
+
+@str_low_high_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "low" : "high"
+
+@script:python depends on report@
+p << str_low_high_r.P;
+e << str_low_high_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_low_high(%s)" % e)
+
+@str_enable_disable depends on patch@
+expression E;
+@@
+- ((E) ? "enable" : "disable")
++ str_enable_disable(E)
+
+@str_enable_disable_r depends on !patch@
+expression E;
+position P;
+@@
+* E@P ? "enable" : "disable"
+
+@script:python depends on report@
+p << str_enable_disable_r.P;
+e << str_enable_disable_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_enable_disable(%s)" % e)
+
+@str_enabled_disabled depends on patch@
+expression E;
+@@
+- ((E) ? "enabled" : "disabled")
++ str_enabled_disabled(E)
+
+@str_enabled_disabled_r depends on !patch@
+expression E;
+position P;
+@@
+* E@P ? "enabled" : "disabled"
+
+@script:python depends on report@
+p << str_enabled_disabled_r.P;
+e << str_enabled_disabled_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_enabled_disabled(%s)" % e)
+
+@str_read_write depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "read" : "write")
++ str_read_write(E)
+
+@str_read_write_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "read" : "write"
+
+@script:python depends on report@
+p << str_read_write_r.P;
+e << str_read_write_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_read_write(%s)" % e)
+
+@str_write_read depends on patch disable neg_if_exp@
+expression E;
+@@
+- ((E) ? "write" : "read")
++ str_write_read(E)
+
+@str_write_read_r depends on !patch disable neg_if_exp@
+expression E;
+position P;
+@@
+* E@P ? "write" : "read"
+
+@script:python depends on report@
+p << str_write_read_r.P;
+e << str_write_read_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_write_read(%s)" % e)
+
+@str_on_off depends on patch@
+expression E;
+@@
+- ((E) ? "on" : "off")
++ str_on_off(E)
+
+@str_on_off_r depends on !patch@
+expression E;
+position P;
+@@
+* E@P ? "on" : "off"
+
+@script:python depends on report@
+p << str_on_off_r.P;
+e << str_on_off_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_on_off(%s)" % e)
+
+@str_yes_no depends on patch@
+expression E;
+@@
+- ((E) ? "yes" : "no")
++ str_yes_no(E)
+
+@str_yes_no_r depends on !patch@
+expression E;
+position P;
+@@
+* E@P ? "yes" : "no"
+
+@script:python depends on report@
+p << str_yes_no_r.P;
+e << str_yes_no_r.E;
+@@
+
+coccilib.report.print_report(p[0], "opportunity for str_yes_no(%s)" % e)
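For reference, the str_*() helpers these rules target live in
include/linux/string_choices.h, and the "disable neg_if_exp" annotations keep
Coccinelle's negation isomorphism from also matching the inverted ternary
(e.g. !E ? "down" : "up"), which belongs to the opposite rule. A minimal
before/after sketch of what the patch mode does, using a hypothetical driver
snippet (report mode only prints the "opportunity for ..." lines above):

	#include <linux/pm_wakeup.h>
	#include <linux/printk.h>
	#include <linux/string_choices.h>

	static void report_wakeup(struct device *dev)
	{
		/* Before: the open-coded ternary the str_yes_no rule matches. */
		pr_info("wakeup: %s\n", device_may_wakeup(dev) ? "yes" : "no");

		/* After: the equivalent call the patch mode substitutes. */
		pr_info("wakeup: %s\n", str_yes_no(device_may_wakeup(dev)));
	}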
diff --git a/tools/include/linux/linkage.h b/tools/include/linux/linkage.h
index b7183576d8eb..7baaa5898ca2 100644
--- a/tools/include/linux/linkage.h
+++ b/tools/include/linux/linkage.h
@@ -4,7 +4,9 @@
#include <linux/export.h>
#define SYM_FUNC_START(x) .globl x; x:
-
#define SYM_FUNC_END(x)
+#define SYM_DATA_START(x) .globl x; x:
+#define SYM_DATA_START_LOCAL(x) x:
+#define SYM_DATA_END(x)
#endif /* _TOOLS_INCLUDE_LINUX_LINKAGE_H */
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 6d9381d60172..7f57abf936e7 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -5,3 +5,7 @@
!*.h
!*.S
!*.sh
+!.gitignore
+!config
+!settings
+!Makefile
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 0c4b254ab56b..960cf6a77198 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -130,6 +130,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test
TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test
TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test
TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
+TEST_GEN_PROGS_x86_64 += coalesced_io_test
TEST_GEN_PROGS_x86_64 += demand_paging_test
TEST_GEN_PROGS_x86_64 += dirty_log_test
TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
@@ -167,6 +168,7 @@ TEST_GEN_PROGS_aarch64 += aarch64/vpmu_counter_access
TEST_GEN_PROGS_aarch64 += aarch64/no-vgic-v3
TEST_GEN_PROGS_aarch64 += access_tracking_perf_test
TEST_GEN_PROGS_aarch64 += arch_timer
+TEST_GEN_PROGS_aarch64 += coalesced_io_test
TEST_GEN_PROGS_aarch64 += demand_paging_test
TEST_GEN_PROGS_aarch64 += dirty_log_test
TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
@@ -188,6 +190,7 @@ TEST_GEN_PROGS_s390x += s390x/tprot
TEST_GEN_PROGS_s390x += s390x/cmma_test
TEST_GEN_PROGS_s390x += s390x/debug_test
TEST_GEN_PROGS_s390x += s390x/shared_zeropage_test
+TEST_GEN_PROGS_s390x += s390x/ucontrol_test
TEST_GEN_PROGS_s390x += demand_paging_test
TEST_GEN_PROGS_s390x += dirty_log_test
TEST_GEN_PROGS_s390x += guest_print_test
@@ -200,6 +203,7 @@ TEST_GEN_PROGS_s390x += kvm_binary_stats_test
TEST_GEN_PROGS_riscv += riscv/sbi_pmu_test
TEST_GEN_PROGS_riscv += riscv/ebreak_test
TEST_GEN_PROGS_riscv += arch_timer
+TEST_GEN_PROGS_riscv += coalesced_io_test
TEST_GEN_PROGS_riscv += demand_paging_test
TEST_GEN_PROGS_riscv += dirty_log_test
TEST_GEN_PROGS_riscv += get-reg-list
diff --git a/tools/testing/selftests/kvm/coalesced_io_test.c b/tools/testing/selftests/kvm/coalesced_io_test.c
new file mode 100644
index 000000000000..60cb25454899
--- /dev/null
+++ b/tools/testing/selftests/kvm/coalesced_io_test.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include <linux/sizes.h>
+
+#include <kvm_util.h>
+#include <processor.h>
+
+#include "ucall_common.h"
+
+struct kvm_coalesced_io {
+ struct kvm_coalesced_mmio_ring *ring;
+ uint32_t ring_size;
+ uint64_t mmio_gpa;
+ uint64_t *mmio;
+
+ /*
+ * x86-only, but define pio_port for all architectures to minimize the
+ * amount of #ifdeffery and complexity, without having to sacrifice
+ * verbose error messages.
+ */
+ uint8_t pio_port;
+};
+
+static struct kvm_coalesced_io kvm_builtin_io_ring;
+
+#ifdef __x86_64__
+static const int has_pio = 1;
+#else
+static const int has_pio = 0;
+#endif
+
+static void guest_code(struct kvm_coalesced_io *io)
+{
+ int i, j;
+
+ for (;;) {
+ for (j = 0; j < 1 + has_pio; j++) {
+ /*
+ * KVM always leaves one free entry, i.e. exits to
+ * userspace before the last entry is filled.
+ */
+ for (i = 0; i < io->ring_size - 1; i++) {
+#ifdef __x86_64__
+ if (i & 1)
+ outl(io->pio_port, io->pio_port + i);
+ else
+#endif
+ WRITE_ONCE(*io->mmio, io->mmio_gpa + i);
+ }
+#ifdef __x86_64__
+ if (j & 1)
+ outl(io->pio_port, io->pio_port + i);
+ else
+#endif
+ WRITE_ONCE(*io->mmio, io->mmio_gpa + i);
+ }
+ GUEST_SYNC(0);
+
+ WRITE_ONCE(*io->mmio, io->mmio_gpa + i);
+#ifdef __x86_64__
+ outl(io->pio_port, io->pio_port + i);
+#endif
+ }
+}
+
+static void vcpu_run_and_verify_io_exit(struct kvm_vcpu *vcpu,
+ struct kvm_coalesced_io *io,
+ uint32_t ring_start,
+ uint32_t expected_exit)
+{
+ const bool want_pio = expected_exit == KVM_EXIT_IO;
+ struct kvm_coalesced_mmio_ring *ring = io->ring;
+ struct kvm_run *run = vcpu->run;
+ uint32_t pio_value;
+
+ WRITE_ONCE(ring->first, ring_start);
+ WRITE_ONCE(ring->last, ring_start);
+
+ vcpu_run(vcpu);
+
+ /*
+ * Annoyingly, reading PIO data is safe only for PIO exits, otherwise
+ * data_offset is garbage, e.g. an MMIO gpa.
+ */
+ if (run->exit_reason == KVM_EXIT_IO)
+ pio_value = *(uint32_t *)((void *)run + run->io.data_offset);
+ else
+ pio_value = 0;
+
+ TEST_ASSERT((!want_pio && (run->exit_reason == KVM_EXIT_MMIO && run->mmio.is_write &&
+ run->mmio.phys_addr == io->mmio_gpa && run->mmio.len == 8 &&
+ *(uint64_t *)run->mmio.data == io->mmio_gpa + io->ring_size - 1)) ||
+ (want_pio && (run->exit_reason == KVM_EXIT_IO && run->io.port == io->pio_port &&
+ run->io.direction == KVM_EXIT_IO_OUT && run->io.count == 1 &&
+ pio_value == io->pio_port + io->ring_size - 1)),
+ "For start = %u, expected exit on %u-byte %s write 0x%llx = %lx, got exit_reason = %u (%s)\n "
+ "(MMIO addr = 0x%llx, write = %u, len = %u, data = %lx)\n "
+ "(PIO port = 0x%x, write = %u, len = %u, count = %u, data = %x",
+ ring_start, want_pio ? 4 : 8, want_pio ? "PIO" : "MMIO",
+ want_pio ? (unsigned long long)io->pio_port : io->mmio_gpa,
+ (want_pio ? io->pio_port : io->mmio_gpa) + io->ring_size - 1, run->exit_reason,
+ run->exit_reason == KVM_EXIT_MMIO ? "MMIO" : run->exit_reason == KVM_EXIT_IO ? "PIO" : "other",
+ run->mmio.phys_addr, run->mmio.is_write, run->mmio.len, *(uint64_t *)run->mmio.data,
+ run->io.port, run->io.direction, run->io.size, run->io.count, pio_value);
+}
+
+static void vcpu_run_and_verify_coalesced_io(struct kvm_vcpu *vcpu,
+ struct kvm_coalesced_io *io,
+ uint32_t ring_start,
+ uint32_t expected_exit)
+{
+ struct kvm_coalesced_mmio_ring *ring = io->ring;
+ int i;
+
+ vcpu_run_and_verify_io_exit(vcpu, io, ring_start, expected_exit);
+
+ TEST_ASSERT((ring->last + 1) % io->ring_size == ring->first,
+ "Expected ring to be full (minus 1), first = %u, last = %u, max = %u, start = %u",
+ ring->first, ring->last, io->ring_size, ring_start);
+
+ for (i = 0; i < io->ring_size - 1; i++) {
+ uint32_t idx = (ring->first + i) % io->ring_size;
+ struct kvm_coalesced_mmio *entry = &ring->coalesced_mmio[idx];
+
+#ifdef __x86_64__
+ if (i & 1)
+ TEST_ASSERT(entry->phys_addr == io->pio_port &&
+ entry->len == 4 && entry->pio &&
+ *(uint32_t *)entry->data == io->pio_port + i,
+ "Wanted 4-byte port I/O 0x%x = 0x%x in entry %u, got %u-byte %s 0x%llx = 0x%x",
+ io->pio_port, io->pio_port + i, i,
+ entry->len, entry->pio ? "PIO" : "MMIO",
+ entry->phys_addr, *(uint32_t *)entry->data);
+ else
+#endif
+ TEST_ASSERT(entry->phys_addr == io->mmio_gpa &&
+ entry->len == 8 && !entry->pio,
+ "Wanted 8-byte MMIO to 0x%lx = %lx in entry %u, got %u-byte %s 0x%llx = 0x%lx",
+ io->mmio_gpa, io->mmio_gpa + i, i,
+ entry->len, entry->pio ? "PIO" : "MMIO",
+ entry->phys_addr, *(uint64_t *)entry->data);
+ }
+}
+
+static void test_coalesced_io(struct kvm_vcpu *vcpu,
+ struct kvm_coalesced_io *io, uint32_t ring_start)
+{
+ struct kvm_coalesced_mmio_ring *ring = io->ring;
+
+ kvm_vm_register_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */);
+#ifdef __x86_64__
+ kvm_vm_register_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */);
+#endif
+
+ vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_MMIO);
+#ifdef __x86_64__
+ vcpu_run_and_verify_coalesced_io(vcpu, io, ring_start, KVM_EXIT_IO);
+#endif
+
+ /*
+ * Verify ucall, which may use non-coalesced MMIO or PIO, generates an
+ * immediate exit.
+ */
+ WRITE_ONCE(ring->first, ring_start);
+ WRITE_ONCE(ring->last, ring_start);
+ vcpu_run(vcpu);
+ TEST_ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC);
+ TEST_ASSERT_EQ(ring->first, ring_start);
+ TEST_ASSERT_EQ(ring->last, ring_start);
+
+ /* Verify that non-coalesced MMIO/PIO generates an exit to userspace. */
+ kvm_vm_unregister_coalesced_io(vcpu->vm, io->mmio_gpa, 8, false /* pio */);
+ vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_MMIO);
+
+#ifdef __x86_64__
+ kvm_vm_unregister_coalesced_io(vcpu->vm, io->pio_port, 8, true /* pio */);
+ vcpu_run_and_verify_io_exit(vcpu, io, ring_start, KVM_EXIT_IO);
+#endif
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int i;
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_MMIO));
+
+#ifdef __x86_64__
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_COALESCED_PIO));
+#endif
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+
+ kvm_builtin_io_ring = (struct kvm_coalesced_io) {
+ /*
+ * The I/O ring is a kernel-allocated page whose address is
+ * relative to each vCPU's run page, with the page offset
+	 * returned by the KVM_CAP_COALESCED_MMIO capability check.
+ */
+ .ring = (void *)vcpu->run +
+ (kvm_check_cap(KVM_CAP_COALESCED_MMIO) * getpagesize()),
+
+ /*
+	 * The size of the I/O ring is fixed, but KVM defines the size
+ * based on the kernel's PAGE_SIZE. Thus, userspace must query
+ * the host's page size at runtime to compute the ring size.
+ */
+ .ring_size = (getpagesize() - sizeof(struct kvm_coalesced_mmio_ring)) /
+ sizeof(struct kvm_coalesced_mmio),
+
+ /*
+ * Arbitrary address+port (MMIO mustn't overlap memslots), with
+ * the MMIO GPA identity mapped in the guest.
+ */
+ .mmio_gpa = 4ull * SZ_1G,
+ .mmio = (uint64_t *)(4ull * SZ_1G),
+ .pio_port = 0x80,
+ };
+
+ virt_map(vm, (uint64_t)kvm_builtin_io_ring.mmio, kvm_builtin_io_ring.mmio_gpa, 1);
+
+ sync_global_to_guest(vm, kvm_builtin_io_ring);
+ vcpu_args_set(vcpu, 1, &kvm_builtin_io_ring);
+
+ for (i = 0; i < kvm_builtin_io_ring.ring_size; i++)
+ test_coalesced_io(vcpu, &kvm_builtin_io_ring, i);
+
+ kvm_vm_free(vm);
+ return 0;
+}
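For context on the "KVM always leaves one free entry" comment in guest_code():
the ring is a single-producer/single-consumer queue in which KVM advances
'last' as it logs I/O and exits to userspace when only one slot remains,
while userspace consumes entries from 'first'. A hedged sketch of a generic
drain loop (the handler callback is ours, not part of the selftest API):

	#include <stdint.h>
	#include <linux/kvm.h>

	/* Illustrative only: consume every coalesced entry currently logged. */
	static void drain_coalesced_ring(struct kvm_coalesced_mmio_ring *ring,
					 uint32_t ring_size,
					 void (*handle)(const struct kvm_coalesced_mmio *))
	{
		while (ring->first != ring->last) {
			handle(&ring->coalesced_mmio[ring->first]);
			/* Order the entry read before publishing 'first'. */
			__sync_synchronize();
			ring->first = (ring->first + 1) % ring_size;
		}
	}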
diff --git a/tools/testing/selftests/kvm/guest_print_test.c b/tools/testing/selftests/kvm/guest_print_test.c
index 8092c2d0f5d6..bcf582852db9 100644
--- a/tools/testing/selftests/kvm/guest_print_test.c
+++ b/tools/testing/selftests/kvm/guest_print_test.c
@@ -107,6 +107,21 @@ static void ucall_abort(const char *assert_msg, const char *expected_assert_msg)
expected_assert_msg, &assert_msg[offset]);
}
+/*
+ * Open code vcpu_run(), sans the UCALL_ABORT handling, so that intentional
+ * guest asserts can be verified instead of being reported as failures.
+ */
+static void do_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ int r;
+
+ do {
+ r = __vcpu_run(vcpu);
+ } while (r == -1 && errno == EINTR);
+
+ TEST_ASSERT(!r, KVM_IOCTL_ERROR(KVM_RUN, r));
+}
+
static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf,
const char *expected_assert)
{
@@ -114,7 +129,7 @@ static void run_test(struct kvm_vcpu *vcpu, const char *expected_printf,
struct ucall uc;
while (1) {
- vcpu_run(vcpu);
+ do_vcpu_run(vcpu);
TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON,
"Unexpected exit reason: %u (%s),",
@@ -159,7 +174,7 @@ static void test_limits(void)
vm = vm_create_with_one_vcpu(&vcpu, guest_code_limits);
run = vcpu->run;
- vcpu_run(vcpu);
+ do_vcpu_run(vcpu);
TEST_ASSERT(run->exit_reason == UCALL_EXIT_REASON,
"Unexpected exit reason: %u (%s),",
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 63c2aaae51f3..bc7c242480d6 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -428,8 +428,6 @@ const char *vm_guest_mode_string(uint32_t i);
void kvm_vm_free(struct kvm_vm *vmp);
void kvm_vm_restart(struct kvm_vm *vmp);
void kvm_vm_release(struct kvm_vm *vmp);
-int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
- size_t len);
void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename);
int kvm_memfd_alloc(size_t size, bool hugepages);
@@ -460,6 +458,32 @@ static inline uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
return __vm_ioctl(vm, KVM_RESET_DIRTY_RINGS, NULL);
}
+static inline void kvm_vm_register_coalesced_io(struct kvm_vm *vm,
+ uint64_t address,
+ uint64_t size, bool pio)
+{
+ struct kvm_coalesced_mmio_zone zone = {
+ .addr = address,
+ .size = size,
+ .pio = pio,
+ };
+
+ vm_ioctl(vm, KVM_REGISTER_COALESCED_MMIO, &zone);
+}
+
+static inline void kvm_vm_unregister_coalesced_io(struct kvm_vm *vm,
+ uint64_t address,
+ uint64_t size, bool pio)
+{
+ struct kvm_coalesced_mmio_zone zone = {
+ .addr = address,
+ .size = size,
+ .pio = pio,
+ };
+
+ vm_ioctl(vm, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+}
+
static inline int vm_get_stats_fd(struct kvm_vm *vm)
{
int fd = __vm_ioctl(vm, KVM_GET_STATS_FD, NULL);
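Both wrappers above issue the KVM_(UN)REGISTER_COALESCED_MMIO ioctls with a
struct kvm_coalesced_mmio_zone; the pio flag selects the x86 port-I/O address
space. A short usage sketch (the zone address is arbitrary):

	/* Coalesce an 8-byte MMIO window, run, then restore the default
	 * exit-per-access behavior for the same zone. */
	kvm_vm_register_coalesced_io(vm, 4ull * SZ_1G, 8, false /* pio */);
	vcpu_run(vcpu);		/* writes to the zone land in the ring */
	kvm_vm_unregister_coalesced_io(vm, 4ull * SZ_1G, 8, false /* pio */);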
diff --git a/tools/testing/selftests/kvm/include/s390x/debug_print.h b/tools/testing/selftests/kvm/include/s390x/debug_print.h
new file mode 100644
index 000000000000..1bf275631cc6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/debug_print.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Definition for kernel virtual machines on s390x
+ *
+ * Copyright IBM Corp. 2024
+ *
+ * Authors:
+ * Christoph Schlameuss <[email protected]>
+ */
+
+#ifndef SELFTEST_KVM_DEBUG_PRINT_H
+#define SELFTEST_KVM_DEBUG_PRINT_H
+
+#include "asm/ptrace.h"
+#include "kvm_util.h"
+#include "sie.h"
+
+static inline void print_hex_bytes(const char *name, u64 addr, size_t len)
+{
+ u64 pos;
+
+ pr_debug("%s (%p)\n", name, (void *)addr);
+ pr_debug(" 0/0x00---------|");
+ if (len > 8)
+ pr_debug(" 8/0x08---------|");
+ if (len > 16)
+ pr_debug(" 16/0x10--------|");
+ if (len > 24)
+ pr_debug(" 24/0x18--------|");
+ for (pos = 0; pos < len; pos += 8) {
+ if ((pos % 32) == 0)
+ pr_debug("\n %3lu 0x%.3lx ", pos, pos);
+ pr_debug(" %16lx", *((u64 *)(addr + pos)));
+ }
+ pr_debug("\n");
+}
+
+static inline void print_hex(const char *name, u64 addr)
+{
+ print_hex_bytes(name, addr, 512);
+}
+
+static inline void print_psw(struct kvm_run *run, struct kvm_s390_sie_block *sie_block)
+{
+ pr_debug("flags:0x%x psw:0x%.16llx:0x%.16llx exit:%u %s\n",
+ run->flags,
+ run->psw_mask, run->psw_addr,
+ run->exit_reason, exit_reason_str(run->exit_reason));
+ pr_debug("sie_block psw:0x%.16llx:0x%.16llx\n",
+ sie_block->psw_mask, sie_block->psw_addr);
+}
+
+static inline void print_run(struct kvm_run *run, struct kvm_s390_sie_block *sie_block)
+{
+ print_hex_bytes("run", (u64)run, 0x150);
+ print_hex("sie_block", (u64)sie_block);
+ print_psw(run, sie_block);
+}
+
+static inline void print_regs(struct kvm_run *run)
+{
+ struct kvm_sync_regs *sync_regs = &run->s.regs;
+
+ print_hex_bytes("GPRS", (u64)sync_regs->gprs, 8 * NUM_GPRS);
+ print_hex_bytes("ACRS", (u64)sync_regs->acrs, 4 * NUM_ACRS);
+ print_hex_bytes("CRS", (u64)sync_regs->crs, 8 * NUM_CRS);
+}
+
+#endif /* SELFTEST_KVM_DEBUG_PRINT_H */
diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h
index 255c9b990f4c..481bd2fd6a32 100644
--- a/tools/testing/selftests/kvm/include/s390x/processor.h
+++ b/tools/testing/selftests/kvm/include/s390x/processor.h
@@ -21,6 +21,11 @@
#define PAGE_PROTECT 0x200 /* HW read-only bit */
#define PAGE_NOEXEC 0x100 /* HW no-execute bit */
+/* Page size definitions */
+#define PAGE_SHIFT 12
+#define PAGE_SIZE BIT_ULL(PAGE_SHIFT)
+#define PAGE_MASK (~(PAGE_SIZE - 1))
+
/* Is there a portable way to do this? */
static inline void cpu_relax(void)
{
diff --git a/tools/testing/selftests/kvm/include/s390x/sie.h b/tools/testing/selftests/kvm/include/s390x/sie.h
new file mode 100644
index 000000000000..160acd4a1db9
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/sie.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Definition for kernel virtual machines on s390.
+ *
+ * Adapted copy of struct definition kvm_s390_sie_block from
+ * arch/s390/include/asm/kvm_host.h for use in userspace selftest programs.
+ *
+ * Copyright IBM Corp. 2008, 2024
+ *
+ * Authors:
+ * Christoph Schlameuss <[email protected]>
+ * Carsten Otte <[email protected]>
+ */
+
+#ifndef SELFTEST_KVM_SIE_H
+#define SELFTEST_KVM_SIE_H
+
+#include <linux/types.h>
+
+struct kvm_s390_sie_block {
+#define CPUSTAT_STOPPED 0x80000000
+#define CPUSTAT_WAIT 0x10000000
+#define CPUSTAT_ECALL_PEND 0x08000000
+#define CPUSTAT_STOP_INT 0x04000000
+#define CPUSTAT_IO_INT 0x02000000
+#define CPUSTAT_EXT_INT 0x01000000
+#define CPUSTAT_RUNNING 0x00800000
+#define CPUSTAT_RETAINED 0x00400000
+#define CPUSTAT_TIMING_SUB 0x00020000
+#define CPUSTAT_SIE_SUB 0x00010000
+#define CPUSTAT_RRF 0x00008000
+#define CPUSTAT_SLSV 0x00004000
+#define CPUSTAT_SLSR 0x00002000
+#define CPUSTAT_ZARCH 0x00000800
+#define CPUSTAT_MCDS 0x00000100
+#define CPUSTAT_KSS 0x00000200
+#define CPUSTAT_SM 0x00000080
+#define CPUSTAT_IBS 0x00000040
+#define CPUSTAT_GED2 0x00000010
+#define CPUSTAT_G 0x00000008
+#define CPUSTAT_GED 0x00000004
+#define CPUSTAT_J 0x00000002
+#define CPUSTAT_P 0x00000001
+ __u32 cpuflags; /* 0x0000 */
+ __u32: 1; /* 0x0004 */
+ __u32 prefix : 18;
+ __u32: 1;
+ __u32 ibc : 12;
+ __u8 reserved08[4]; /* 0x0008 */
+#define PROG_IN_SIE BIT(0)
+ __u32 prog0c; /* 0x000c */
+ union {
+ __u8 reserved10[16]; /* 0x0010 */
+ struct {
+ __u64 pv_handle_cpu;
+ __u64 pv_handle_config;
+ };
+ };
+#define PROG_BLOCK_SIE BIT(0)
+#define PROG_REQUEST BIT(1)
+ __u32 prog20; /* 0x0020 */
+ __u8 reserved24[4]; /* 0x0024 */
+ __u64 cputm; /* 0x0028 */
+ __u64 ckc; /* 0x0030 */
+ __u64 epoch; /* 0x0038 */
+ __u32 svcc; /* 0x0040 */
+#define LCTL_CR0 0x8000
+#define LCTL_CR6 0x0200
+#define LCTL_CR9 0x0040
+#define LCTL_CR10 0x0020
+#define LCTL_CR11 0x0010
+#define LCTL_CR14 0x0002
+ __u16 lctl; /* 0x0044 */
+ __s16 icpua; /* 0x0046 */
+#define ICTL_OPEREXC 0x80000000
+#define ICTL_PINT 0x20000000
+#define ICTL_LPSW 0x00400000
+#define ICTL_STCTL 0x00040000
+#define ICTL_ISKE 0x00004000
+#define ICTL_SSKE 0x00002000
+#define ICTL_RRBE 0x00001000
+#define ICTL_TPROT 0x00000200
+ __u32 ictl; /* 0x0048 */
+#define ECA_CEI 0x80000000
+#define ECA_IB 0x40000000
+#define ECA_SIGPI 0x10000000
+#define ECA_MVPGI 0x01000000
+#define ECA_AIV 0x00200000
+#define ECA_VX 0x00020000
+#define ECA_PROTEXCI 0x00002000
+#define ECA_APIE 0x00000008
+#define ECA_SII 0x00000001
+ __u32 eca; /* 0x004c */
+#define ICPT_INST 0x04
+#define ICPT_PROGI 0x08
+#define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTREQ 0x10
+#define ICPT_EXTINT 0x14
+#define ICPT_IOREQ 0x18
+#define ICPT_WAIT 0x1c
+#define ICPT_VALIDITY 0x20
+#define ICPT_STOP 0x28
+#define ICPT_OPEREXC 0x2C
+#define ICPT_PARTEXEC 0x38
+#define ICPT_IOINST 0x40
+#define ICPT_KSS 0x5c
+#define ICPT_MCHKREQ 0x60
+#define ICPT_INT_ENABLE 0x64
+#define ICPT_PV_INSTR 0x68
+#define ICPT_PV_NOTIFY 0x6c
+#define ICPT_PV_PREF 0x70
+ __u8 icptcode; /* 0x0050 */
+ __u8 icptstatus; /* 0x0051 */
+ __u16 ihcpu; /* 0x0052 */
+ __u8 reserved54; /* 0x0054 */
+#define IICTL_CODE_NONE 0x00
+#define IICTL_CODE_MCHK 0x01
+#define IICTL_CODE_EXT 0x02
+#define IICTL_CODE_IO 0x03
+#define IICTL_CODE_RESTART 0x04
+#define IICTL_CODE_SPECIFICATION 0x10
+#define IICTL_CODE_OPERAND 0x11
+ __u8 iictl; /* 0x0055 */
+ __u16 ipa; /* 0x0056 */
+ __u32 ipb; /* 0x0058 */
+ __u32 scaoh; /* 0x005c */
+#define FPF_BPBC 0x20
+ __u8 fpf; /* 0x0060 */
+#define ECB_GS 0x40
+#define ECB_TE 0x10
+#define ECB_SPECI 0x08
+#define ECB_SRSI 0x04
+#define ECB_HOSTPROTINT 0x02
+#define ECB_PTF 0x01
+ __u8 ecb; /* 0x0061 */
+#define ECB2_CMMA 0x80
+#define ECB2_IEP 0x20
+#define ECB2_PFMFI 0x08
+#define ECB2_ESCA 0x04
+#define ECB2_ZPCI_LSI 0x02
+ __u8 ecb2; /* 0x0062 */
+#define ECB3_AISI 0x20
+#define ECB3_AISII 0x10
+#define ECB3_DEA 0x08
+#define ECB3_AES 0x04
+#define ECB3_RI 0x01
+ __u8 ecb3; /* 0x0063 */
+#define ESCA_SCAOL_MASK ~0x3fU
+ __u32 scaol; /* 0x0064 */
+ __u8 sdf; /* 0x0068 */
+ __u8 epdx; /* 0x0069 */
+ __u8 cpnc; /* 0x006a */
+ __u8 reserved6b; /* 0x006b */
+ __u32 todpr; /* 0x006c */
+#define GISA_FORMAT1 0x00000001
+ __u32 gd; /* 0x0070 */
+ __u8 reserved74[12]; /* 0x0074 */
+ __u64 mso; /* 0x0080 */
+ __u64 msl; /* 0x0088 */
+ __u64 psw_mask; /* 0x0090 */
+ __u64 psw_addr; /* 0x0098 */
+ __u64 gg14; /* 0x00a0 */
+ __u64 gg15; /* 0x00a8 */
+ __u8 reservedb0[8]; /* 0x00b0 */
+#define HPID_KVM 0x4
+#define HPID_VSIE 0x5
+ __u8 hpid; /* 0x00b8 */
+ __u8 reservedb9[7]; /* 0x00b9 */
+ union {
+ struct {
+ __u32 eiparams; /* 0x00c0 */
+ __u16 extcpuaddr; /* 0x00c4 */
+ __u16 eic; /* 0x00c6 */
+ };
+ __u64 mcic; /* 0x00c0 */
+ } __packed;
+ __u32 reservedc8; /* 0x00c8 */
+ union {
+ struct {
+ __u16 pgmilc; /* 0x00cc */
+ __u16 iprcc; /* 0x00ce */
+ };
+ __u32 edc; /* 0x00cc */
+ } __packed;
+ union {
+ struct {
+ __u32 dxc; /* 0x00d0 */
+ __u16 mcn; /* 0x00d4 */
+ __u8 perc; /* 0x00d6 */
+ __u8 peratmid; /* 0x00d7 */
+ };
+ __u64 faddr; /* 0x00d0 */
+ } __packed;
+ __u64 peraddr; /* 0x00d8 */
+ __u8 eai; /* 0x00e0 */
+ __u8 peraid; /* 0x00e1 */
+ __u8 oai; /* 0x00e2 */
+ __u8 armid; /* 0x00e3 */
+ __u8 reservede4[4]; /* 0x00e4 */
+ union {
+ __u64 tecmc; /* 0x00e8 */
+ struct {
+ __u16 subchannel_id; /* 0x00e8 */
+ __u16 subchannel_nr; /* 0x00ea */
+ __u32 io_int_parm; /* 0x00ec */
+ __u32 io_int_word; /* 0x00f0 */
+ };
+ } __packed;
+ __u8 reservedf4[8]; /* 0x00f4 */
+#define CRYCB_FORMAT_MASK 0x00000003
+#define CRYCB_FORMAT0 0x00000000
+#define CRYCB_FORMAT1 0x00000001
+#define CRYCB_FORMAT2 0x00000003
+ __u32 crycbd; /* 0x00fc */
+ __u64 gcr[16]; /* 0x0100 */
+ union {
+ __u64 gbea; /* 0x0180 */
+ __u64 sidad;
+ };
+ __u8 reserved188[8]; /* 0x0188 */
+ __u64 sdnxo; /* 0x0190 */
+ __u8 reserved198[8]; /* 0x0198 */
+ __u32 fac; /* 0x01a0 */
+ __u8 reserved1a4[20]; /* 0x01a4 */
+ __u64 cbrlo; /* 0x01b8 */
+ __u8 reserved1c0[8]; /* 0x01c0 */
+#define ECD_HOSTREGMGMT 0x20000000
+#define ECD_MEF 0x08000000
+#define ECD_ETOKENF 0x02000000
+#define ECD_ECC 0x00200000
+ __u32 ecd; /* 0x01c8 */
+ __u8 reserved1cc[18]; /* 0x01cc */
+ __u64 pp; /* 0x01de */
+ __u8 reserved1e6[2]; /* 0x01e6 */
+ __u64 itdba; /* 0x01e8 */
+ __u64 riccbd; /* 0x01f0 */
+ __u64 gvrd; /* 0x01f8 */
+} __packed __aligned(512);
+
+#endif /* SELFTEST_KVM_SIE_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/apic.h b/tools/testing/selftests/kvm/include/x86_64/apic.h
index 0f268b55fa06..51990094effd 100644
--- a/tools/testing/selftests/kvm/include/x86_64/apic.h
+++ b/tools/testing/selftests/kvm/include/x86_64/apic.h
@@ -11,6 +11,7 @@
#include <stdint.h>
#include "processor.h"
+#include "ucall_common.h"
#define APIC_DEFAULT_GPA 0xfee00000ULL
@@ -93,9 +94,27 @@ static inline uint64_t x2apic_read_reg(unsigned int reg)
return rdmsr(APIC_BASE_MSR + (reg >> 4));
}
+static inline uint8_t x2apic_write_reg_safe(unsigned int reg, uint64_t value)
+{
+ return wrmsr_safe(APIC_BASE_MSR + (reg >> 4), value);
+}
+
static inline void x2apic_write_reg(unsigned int reg, uint64_t value)
{
- wrmsr(APIC_BASE_MSR + (reg >> 4), value);
+ uint8_t fault = x2apic_write_reg_safe(reg, value);
+
+ __GUEST_ASSERT(!fault, "Unexpected fault 0x%x on WRMSR(%x) = %lx\n",
+ fault, APIC_BASE_MSR + (reg >> 4), value);
}
+static inline void x2apic_write_reg_fault(unsigned int reg, uint64_t value)
+{
+ uint8_t fault = x2apic_write_reg_safe(reg, value);
+
+ __GUEST_ASSERT(fault == GP_VECTOR,
+ "Wanted #GP on WRMSR(%x) = %lx, got 0x%x\n",
+ APIC_BASE_MSR + (reg >> 4), value, fault);
+}
+
#endif /* SELFTEST_KVM_APIC_H */
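A sketch of how guest code might use the new fault-expecting variant; the
choice of registers is ours (in x2APIC mode the APIC ID register is
read-only, so the WRMSR must take a #GP):

	static void guest_x2apic_sanity(void)
	{
		/* Plain write: x2apic_write_reg() asserts no fault occurred. */
		x2apic_write_reg(APIC_SPIV, APIC_SPIV_APIC_ENABLED | 0xff);

		/* Read-only in x2APIC mode: the write must fault with #GP. */
		x2apic_write_reg_fault(APIC_ID, 0);
	}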
diff --git a/tools/testing/selftests/kvm/include/x86_64/hyperv.h b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
index fa65b908b13e..6849e2552f1b 100644
--- a/tools/testing/selftests/kvm/include/x86_64/hyperv.h
+++ b/tools/testing/selftests/kvm/include/x86_64/hyperv.h
@@ -186,6 +186,18 @@
#define HV_X64_ENLIGHTENED_VMCS_RECOMMENDED \
KVM_X86_CPU_FEATURE(HYPERV_CPUID_ENLIGHTMENT_INFO, 0, EAX, 14)
+/* HYPERV_CPUID_NESTED_FEATURES.EAX */
+#define HV_X64_NESTED_DIRECT_FLUSH \
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 17)
+#define HV_X64_NESTED_GUEST_MAPPING_FLUSH \
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 18)
+#define HV_X64_NESTED_MSR_BITMAP \
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EAX, 19)
+
+/* HYPERV_CPUID_NESTED_FEATURES.EBX */
+#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL \
+ KVM_X86_CPU_FEATURE(HYPERV_CPUID_NESTED_FEATURES, 0, EBX, 0)
+
/* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING \
KVM_X86_CPU_FEATURE(HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, 0, EAX, 1)
@@ -343,4 +355,10 @@ struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm,
/* HV_X64_MSR_TSC_INVARIANT_CONTROL bits */
#define HV_INVARIANT_TSC_EXPOSED BIT_ULL(0)
+const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
+const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu);
+void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu);
+
+bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature);
+
#endif /* !SELFTEST_KVM_HYPERV_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
index a0c1440017bb..e247f99e0473 100644
--- a/tools/testing/selftests/kvm/include/x86_64/processor.h
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -25,6 +25,10 @@ extern bool host_cpu_is_intel;
extern bool host_cpu_is_amd;
extern uint64_t guest_tsc_khz;
+#ifndef MAX_NR_CPUID_ENTRIES
+#define MAX_NR_CPUID_ENTRIES 100
+#endif
+
/* Forced emulation prefix, used to invoke the emulator unconditionally. */
#define KVM_FEP "ud2; .byte 'k', 'v', 'm';"
@@ -908,8 +912,6 @@ static inline void vcpu_xcrs_set(struct kvm_vcpu *vcpu, struct kvm_xcrs *xcrs)
const struct kvm_cpuid_entry2 *get_cpuid_entry(const struct kvm_cpuid2 *cpuid,
uint32_t function, uint32_t index);
const struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
-const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void);
-const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu);
static inline uint32_t kvm_cpu_fms(void)
{
@@ -1009,7 +1011,6 @@ static inline struct kvm_cpuid2 *allocate_kvm_cpuid2(int nr_entries)
}
void vcpu_init_cpuid(struct kvm_vcpu *vcpu, const struct kvm_cpuid2 *cpuid);
-void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu);
static inline struct kvm_cpuid_entry2 *__vcpu_get_cpuid_entry(struct kvm_vcpu *vcpu,
uint32_t function,
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 56b170b725b3..a2b7df5f1d39 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -712,16 +712,13 @@ void kvm_vm_release(struct kvm_vm *vmp)
}
static void __vm_mem_region_delete(struct kvm_vm *vm,
- struct userspace_mem_region *region,
- bool unlink)
+ struct userspace_mem_region *region)
{
int ret;
- if (unlink) {
- rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
- rb_erase(&region->hva_node, &vm->regions.hva_tree);
- hash_del(&region->slot_node);
- }
+ rb_erase(&region->gpa_node, &vm->regions.gpa_tree);
+ rb_erase(&region->hva_node, &vm->regions.hva_tree);
+ hash_del(&region->slot_node);
region->region.memory_size = 0;
vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &region->region);
@@ -762,7 +759,7 @@ void kvm_vm_free(struct kvm_vm *vmp)
/* Free userspace_mem_regions. */
hash_for_each_safe(vmp->regions.slot_hash, ctr, node, region, slot_node)
- __vm_mem_region_delete(vmp, region, false);
+ __vm_mem_region_delete(vmp, region);
/* Free sparsebit arrays. */
sparsebit_free(&vmp->vpages_valid);
@@ -794,76 +791,6 @@ int kvm_memfd_alloc(size_t size, bool hugepages)
return fd;
}
-/*
- * Memory Compare, host virtual to guest virtual
- *
- * Input Args:
- * hva - Starting host virtual address
- * vm - Virtual Machine
- * gva - Starting guest virtual address
- * len - number of bytes to compare
- *
- * Output Args: None
- *
- * Input/Output Args: None
- *
- * Return:
- * Returns 0 if the bytes starting at hva for a length of len
- * are equal the guest virtual bytes starting at gva. Returns
- * a value < 0, if bytes at hva are less than those at gva.
- * Otherwise a value > 0 is returned.
- *
- * Compares the bytes starting at the host virtual address hva, for
- * a length of len, to the guest bytes starting at the guest virtual
- * address given by gva.
- */
-int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
-{
- size_t amt;
-
- /*
- * Compare a batch of bytes until either a match is found
- * or all the bytes have been compared.
- */
- for (uintptr_t offset = 0; offset < len; offset += amt) {
- uintptr_t ptr1 = (uintptr_t)hva + offset;
-
- /*
- * Determine host address for guest virtual address
- * at offset.
- */
- uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
-
- /*
- * Determine amount to compare on this pass.
- * Don't allow the comparsion to cross a page boundary.
- */
- amt = len - offset;
- if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
- amt = vm->page_size - (ptr1 % vm->page_size);
- if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
- amt = vm->page_size - (ptr2 % vm->page_size);
-
- assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
- assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
-
- /*
- * Perform the comparison. If there is a difference
- * return that result to the caller, otherwise need
- * to continue on looking for a mismatch.
- */
- int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
- if (ret != 0)
- return ret;
- }
-
- /*
- * No mismatch found. Let the caller know the two memory
- * areas are equal.
- */
- return 0;
-}
-
static void vm_userspace_mem_region_gpa_insert(struct rb_root *gpa_tree,
struct userspace_mem_region *region)
{
@@ -1270,7 +1197,7 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
*/
void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
{
- __vm_mem_region_delete(vm, memslot2region(vm, slot), true);
+ __vm_mem_region_delete(vm, memslot2region(vm, slot));
}
void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size,
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c
index 4ad4492eea1d..20cfe970e3e3 100644
--- a/tools/testing/selftests/kvm/lib/s390x/processor.c
+++ b/tools/testing/selftests/kvm/lib/s390x/processor.c
@@ -14,7 +14,7 @@ void virt_arch_pgd_alloc(struct kvm_vm *vm)
{
vm_paddr_t paddr;
- TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x",
vm->page_size);
if (vm->pgd_created)
@@ -79,7 +79,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa)
}
/* Fill in page table entry */
- idx = (gva >> 12) & 0x0ffu; /* page index */
+ idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */
if (!(entry[idx] & PAGE_INVALID))
fprintf(stderr,
"WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa);
@@ -91,7 +91,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
int ri, idx;
uint64_t *entry;
- TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x",
vm->page_size);
entry = addr_gpa2hva(vm, vm->pgd);
@@ -103,7 +103,7 @@ vm_paddr_t addr_arch_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN);
}
- idx = (gva >> 12) & 0x0ffu; /* page index */
+ idx = (gva >> PAGE_SHIFT) & 0x0ffu; /* page index */
TEST_ASSERT(!(entry[idx] & PAGE_INVALID),
"No page mapping for vm virtual address 0x%lx", gva);
@@ -168,7 +168,7 @@ struct kvm_vcpu *vm_arch_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id)
struct kvm_sregs sregs;
struct kvm_vcpu *vcpu;
- TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+ TEST_ASSERT(vm->page_size == PAGE_SIZE, "Unsupported page size: 0x%x",
vm->page_size);
stack_vaddr = __vm_vaddr_alloc(vm, stack_size,
diff --git a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c
index efb7e7a1354d..15bc8cd583aa 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/hyperv.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/hyperv.c
@@ -8,6 +8,73 @@
#include "processor.h"
#include "hyperv.h"
+const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
+{
+ static struct kvm_cpuid2 *cpuid;
+ int kvm_fd;
+
+ if (cpuid)
+ return cpuid;
+
+ cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
+ kvm_fd = open_kvm_dev_path_or_exit();
+
+ kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+ close(kvm_fd);
+ return cpuid;
+}
+
+void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu)
+{
+ static struct kvm_cpuid2 *cpuid_full;
+ const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
+ int i, nent = 0;
+
+ if (!cpuid_full) {
+ cpuid_sys = kvm_get_supported_cpuid();
+ cpuid_hv = kvm_get_supported_hv_cpuid();
+
+ cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent);
+ if (!cpuid_full) {
+ perror("malloc");
+ abort();
+ }
+
+ /* Need to skip KVM CPUID leaves 0x400000xx */
+ for (i = 0; i < cpuid_sys->nent; i++) {
+ if (cpuid_sys->entries[i].function >= 0x40000000 &&
+ cpuid_sys->entries[i].function < 0x40000100)
+ continue;
+ cpuid_full->entries[nent] = cpuid_sys->entries[i];
+ nent++;
+ }
+
+ memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
+ cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
+ cpuid_full->nent = nent + cpuid_hv->nent;
+ }
+
+ vcpu_init_cpuid(vcpu, cpuid_full);
+}
+
+const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
+
+ vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+ return cpuid;
+}
+
+bool kvm_hv_cpu_has(struct kvm_x86_cpu_feature feature)
+{
+ if (!kvm_has_cap(KVM_CAP_SYS_HYPERV_CPUID))
+ return false;
+
+ return kvm_cpuid_has(kvm_get_supported_hv_cpuid(), feature);
+}
+
struct hyperv_test_pages *vcpu_alloc_hyperv_test_pages(struct kvm_vm *vm,
vm_vaddr_t *p_hv_pages_gva)
{
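Note that kvm_hv_cpu_has() deliberately treats a missing
KVM_CAP_SYS_HYPERV_CPUID as "feature absent", so a test can gate on a
Hyper-V CPUID bit with a single TEST_REQUIRE(), as the hyperv_evmcs.c hunk
below does. A minimal sketch:

	#include "hyperv.h"
	#include "kvm_util.h"

	int main(int argc, char *argv[])
	{
		/* Skips (rather than fails) on kernels without the system-scoped
		 * Hyper-V CPUID ioctl or without the nested feature bit. */
		TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_MSR_BITMAP));
		return 0;
	}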
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
index 153739f2e201..974bcd2df6d7 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -19,8 +19,6 @@
#define KERNEL_DS 0x10
#define KERNEL_TSS 0x18
-#define MAX_NR_CPUID_ENTRIES 100
-
vm_vaddr_t exception_handlers;
bool host_cpu_is_amd;
bool host_cpu_is_intel;
@@ -566,10 +564,8 @@ void route_exception(struct ex_regs *regs)
if (kvm_fixup_exception(regs))
return;
- ucall_assert(UCALL_UNHANDLED,
- "Unhandled exception in guest", __FILE__, __LINE__,
- "Unhandled exception '0x%lx' at guest RIP '0x%lx'",
- regs->vector, regs->rip);
+ GUEST_FAIL("Unhandled exception '0x%lx' at guest RIP '0x%lx'",
+ regs->vector, regs->rip);
}
static void vm_init_descriptor_tables(struct kvm_vm *vm)
@@ -611,7 +607,7 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu)
{
struct ucall uc;
- if (get_ucall(vcpu, &uc) == UCALL_UNHANDLED)
+ if (get_ucall(vcpu, &uc) == UCALL_ABORT)
REPORT_GUEST_ASSERT(uc);
}
@@ -1195,65 +1191,6 @@ void xen_hypercall(uint64_t nr, uint64_t a0, void *a1)
GUEST_ASSERT(!__xen_hypercall(nr, a0, a1));
}
-const struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(void)
-{
- static struct kvm_cpuid2 *cpuid;
- int kvm_fd;
-
- if (cpuid)
- return cpuid;
-
- cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
- kvm_fd = open_kvm_dev_path_or_exit();
-
- kvm_ioctl(kvm_fd, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
-
- close(kvm_fd);
- return cpuid;
-}
-
-void vcpu_set_hv_cpuid(struct kvm_vcpu *vcpu)
-{
- static struct kvm_cpuid2 *cpuid_full;
- const struct kvm_cpuid2 *cpuid_sys, *cpuid_hv;
- int i, nent = 0;
-
- if (!cpuid_full) {
- cpuid_sys = kvm_get_supported_cpuid();
- cpuid_hv = kvm_get_supported_hv_cpuid();
-
- cpuid_full = allocate_kvm_cpuid2(cpuid_sys->nent + cpuid_hv->nent);
- if (!cpuid_full) {
- perror("malloc");
- abort();
- }
-
- /* Need to skip KVM CPUID leaves 0x400000xx */
- for (i = 0; i < cpuid_sys->nent; i++) {
- if (cpuid_sys->entries[i].function >= 0x40000000 &&
- cpuid_sys->entries[i].function < 0x40000100)
- continue;
- cpuid_full->entries[nent] = cpuid_sys->entries[i];
- nent++;
- }
-
- memcpy(&cpuid_full->entries[nent], cpuid_hv->entries,
- cpuid_hv->nent * sizeof(struct kvm_cpuid_entry2));
- cpuid_full->nent = nent + cpuid_hv->nent;
- }
-
- vcpu_init_cpuid(vcpu, cpuid_full);
-}
-
-const struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vcpu *vcpu)
-{
- struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(MAX_NR_CPUID_ENTRIES);
-
- vcpu_ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
-
- return cpuid;
-}
-
unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
index 49f162573126..e3343f0df9e1 100644
--- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c
+++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c
@@ -79,6 +79,7 @@ struct test_params {
useconds_t delay;
uint64_t nr_iterations;
bool partition_vcpu_memory_access;
+ bool disable_slot_zap_quirk;
};
static void run_test(enum vm_guest_mode mode, void *arg)
@@ -89,6 +90,13 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vm = memstress_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1,
VM_MEM_SRC_ANONYMOUS,
p->partition_vcpu_memory_access);
+#ifdef __x86_64__
+ if (p->disable_slot_zap_quirk)
+ vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+
+ pr_info("Memslot zap quirk %s\n", p->disable_slot_zap_quirk ?
+ "disabled" : "enabled");
+#endif
pr_info("Finished creating vCPUs\n");
@@ -107,11 +115,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-m mode] [-d delay_usec]\n"
+ printf("usage: %s [-h] [-m mode] [-d delay_usec] [-q]\n"
" [-b memory] [-v vcpus] [-o] [-i iterations]\n", name);
guest_modes_help();
printf(" -d: add a delay between each iteration of adding and\n"
" deleting a memslot in usec.\n");
+ printf(" -q: Disable memslot zap quirk.\n");
printf(" -b: specify the size of the memory region which should be\n"
" accessed by each vCPU. e.g. 10M or 3G.\n"
" Default: 1G\n");
@@ -137,7 +146,7 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:d:b:v:oi:")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:d:qb:v:oi:")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -160,6 +169,12 @@ int main(int argc, char *argv[])
case 'i':
p.nr_iterations = atoi_positive("Number of iterations", optarg);
break;
+ case 'q':
+ p.disable_slot_zap_quirk = true;
+
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
+ KVM_X86_QUIRK_SLOT_ZAP_ALL);
+ break;
case 'h':
default:
help(argv[0]);
diff --git a/tools/testing/selftests/kvm/memslot_perf_test.c b/tools/testing/selftests/kvm/memslot_perf_test.c
index 579a64f97333..893366982f77 100644
--- a/tools/testing/selftests/kvm/memslot_perf_test.c
+++ b/tools/testing/selftests/kvm/memslot_perf_test.c
@@ -113,6 +113,7 @@ static_assert(ATOMIC_BOOL_LOCK_FREE == 2, "atomic bool is not lockless");
static sem_t vcpu_ready;
static bool map_unmap_verify;
+static bool disable_slot_zap_quirk;
static bool verbose;
#define pr_info_v(...) \
@@ -578,6 +579,9 @@ static bool test_memslot_move_prepare(struct vm_data *data,
uint32_t guest_page_size = data->vm->page_size;
uint64_t movesrcgpa, movetestgpa;
+ if (disable_slot_zap_quirk)
+ vm_enable_cap(data->vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+
movesrcgpa = vm_slot2gpa(data, data->nslots - 1);
if (isactive) {
@@ -896,6 +900,7 @@ static void help(char *name, struct test_args *targs)
pr_info(" -h: print this help screen.\n");
pr_info(" -v: enable verbose mode (not for benchmarking).\n");
pr_info(" -d: enable extra debug checks.\n");
+ pr_info(" -q: Disable memslot zap quirk during memslot move.\n");
pr_info(" -s: specify memslot count cap (-1 means no cap; currently: %i)\n",
targs->nslots);
pr_info(" -f: specify the first test to run (currently: %i; max %zu)\n",
@@ -954,7 +959,7 @@ static bool parse_args(int argc, char *argv[],
uint32_t max_mem_slots;
int opt;
- while ((opt = getopt(argc, argv, "hvds:f:e:l:r:")) != -1) {
+ while ((opt = getopt(argc, argv, "hvdqs:f:e:l:r:")) != -1) {
switch (opt) {
case 'h':
default:
@@ -966,6 +971,11 @@ static bool parse_args(int argc, char *argv[],
case 'd':
map_unmap_verify = true;
break;
+ case 'q':
+ disable_slot_zap_quirk = true;
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) &
+ KVM_X86_QUIRK_SLOT_ZAP_ALL);
+ break;
case 's':
targs->nslots = atoi_paranoid(optarg);
if (targs->nslots <= 1 && targs->nslots != -1) {
diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c
index b39033844756..e32dd59703a0 100644
--- a/tools/testing/selftests/kvm/s390x/cmma_test.c
+++ b/tools/testing/selftests/kvm/s390x/cmma_test.c
@@ -17,16 +17,17 @@
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"
+#include "processor.h"
#define MAIN_PAGE_COUNT 512
#define TEST_DATA_PAGE_COUNT 512
#define TEST_DATA_MEMSLOT 1
-#define TEST_DATA_START_GFN 4096
+#define TEST_DATA_START_GFN PAGE_SIZE
#define TEST_DATA_TWO_PAGE_COUNT 256
#define TEST_DATA_TWO_MEMSLOT 2
-#define TEST_DATA_TWO_START_GFN 8192
+#define TEST_DATA_TWO_START_GFN (2 * PAGE_SIZE)
static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT];
@@ -66,7 +67,7 @@ static void guest_dirty_test_data(void)
" lghi 5,%[page_count]\n"
/* r5 += r1 */
"2: agfr 5,1\n"
- /* r2 = r1 << 12 */
+ /* r2 = r1 << PAGE_SHIFT */
"1: sllg 2,1,12(0)\n"
/* essa(r4, r2, SET_STABLE) */
" .insn rrf,0xb9ab0000,4,2,1,0\n"
diff --git a/tools/testing/selftests/kvm/s390x/config b/tools/testing/selftests/kvm/s390x/config
new file mode 100644
index 000000000000..23270f2d679f
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/config
@@ -0,0 +1,2 @@
+CONFIG_KVM=y
+CONFIG_KVM_S390_UCONTROL=y
diff --git a/tools/testing/selftests/kvm/s390x/debug_test.c b/tools/testing/selftests/kvm/s390x/debug_test.c
index 84313fb27529..ad8095968601 100644
--- a/tools/testing/selftests/kvm/s390x/debug_test.c
+++ b/tools/testing/selftests/kvm/s390x/debug_test.c
@@ -2,12 +2,12 @@
/* Test KVM debugging features. */
#include "kvm_util.h"
#include "test_util.h"
+#include "sie.h"
#include <linux/kvm.h>
#define __LC_SVC_NEW_PSW 0x1c0
#define __LC_PGM_NEW_PSW 0x1d0
-#define ICPT_INSTRUCTION 0x04
#define IPA0_DIAG 0x8300
#define PGM_SPECIFICATION 0x06
@@ -85,7 +85,7 @@ static void test_step_pgm_diag(void)
vm = test_step_int_1(&vcpu, test_step_pgm_diag_guest_code,
__LC_PGM_NEW_PSW, new_psw);
TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC);
- TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INSTRUCTION);
+ TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_INST);
TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa & 0xff00, IPA0_DIAG);
vcpu_ioctl(vcpu, KVM_S390_IRQ, &irq);
vcpu_run(vcpu);
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
index f2df7416be84..4374b4cd2a80 100644
--- a/tools/testing/selftests/kvm/s390x/memop.c
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -16,6 +16,7 @@
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"
+#include "processor.h"
enum mop_target {
LOGICAL,
@@ -226,9 +227,6 @@ static void memop_ioctl(struct test_info info, struct kvm_s390_mem_op *ksmo,
#define CHECK_N_DO(f, ...) ({ f(__VA_ARGS__, CHECK_ONLY); f(__VA_ARGS__); })
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (1ULL << PAGE_SHIFT)
-#define PAGE_MASK (~(PAGE_SIZE - 1))
#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38))
#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39))
diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c
index 7a742a673b7c..12d5e1cb62e3 100644
--- a/tools/testing/selftests/kvm/s390x/tprot.c
+++ b/tools/testing/selftests/kvm/s390x/tprot.c
@@ -9,9 +9,8 @@
#include "kvm_util.h"
#include "kselftest.h"
#include "ucall_common.h"
+#include "processor.h"
-#define PAGE_SHIFT 12
-#define PAGE_SIZE (1 << PAGE_SHIFT)
#define CR0_FETCH_PROTECTION_OVERRIDE (1UL << (63 - 38))
#define CR0_STORAGE_PROTECTION_OVERRIDE (1UL << (63 - 39))
@@ -151,7 +150,7 @@ static enum stage perform_next_stage(int *i, bool mapped_0)
* instead.
* In order to skip these tests we detect this inside the guest
*/
- skip = tests[*i].addr < (void *)4096 &&
+ skip = tests[*i].addr < (void *)PAGE_SIZE &&
tests[*i].expected != TRANSL_UNAVAIL &&
!mapped_0;
if (!skip) {
diff --git a/tools/testing/selftests/kvm/s390x/ucontrol_test.c b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
new file mode 100644
index 000000000000..f257beec1430
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/ucontrol_test.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test code for the s390x kvm ucontrol interface
+ *
+ * Copyright IBM Corp. 2024
+ *
+ * Authors:
+ * Christoph Schlameuss <[email protected]>
+ */
+#include "debug_print.h"
+#include "kselftest_harness.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "sie.h"
+
+#include <linux/capability.h>
+#include <linux/sizes.h>
+
+#define VM_MEM_SIZE (4 * SZ_1M)
+
+/* Declare capget() directly so we can check capabilities without libcap. */
+int capget(cap_user_header_t header, cap_user_data_t data);
+
+/**
+ * Creating user-controlled ("ucontrol") virtual machines on s390 requires
+ * KVM_CAP_S390_UCONTROL and the KVM_VM_S390_UCONTROL VM type, and must be
+ * done as a privileged user (CAP_SYS_ADMIN).
+ */
+void require_ucontrol_admin(void)
+{
+ struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];
+ struct __user_cap_header_struct hdr = {
+ .version = _LINUX_CAPABILITY_VERSION_3,
+ };
+ int rc;
+
+ rc = capget(&hdr, data);
+ TEST_ASSERT_EQ(0, rc);
+ TEST_REQUIRE((data->effective & CAP_TO_MASK(CAP_SYS_ADMIN)) > 0);
+
+ TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_UCONTROL));
+}
+
+/* Test program setting some registers and looping */
+extern char test_gprs_asm[];
+asm("test_gprs_asm:\n"
+ "xgr %r0, %r0\n"
+ "lgfi %r1,1\n"
+ "lgfi %r2,2\n"
+ "lgfi %r3,3\n"
+ "lgfi %r4,4\n"
+ "lgfi %r5,5\n"
+ "lgfi %r6,6\n"
+ "lgfi %r7,7\n"
+ "0:\n"
+ " diag 0,0,0x44\n"
+ " ahi %r0,1\n"
+ " j 0b\n"
+);
+
+FIXTURE(uc_kvm)
+{
+ struct kvm_s390_sie_block *sie_block;
+ struct kvm_run *run;
+ uintptr_t base_gpa;
+ uintptr_t code_gpa;
+ uintptr_t base_hva;
+ uintptr_t code_hva;
+ int kvm_run_size;
+ void *vm_mem;
+ int vcpu_fd;
+ int kvm_fd;
+ int vm_fd;
+};
+
+/**
+ * create VM with single vcpu, map kvm_run and SIE control block for easy access
+ */
+FIXTURE_SETUP(uc_kvm)
+{
+ struct kvm_s390_vm_cpu_processor info;
+ int rc;
+
+ require_ucontrol_admin();
+
+ self->kvm_fd = open_kvm_dev_path_or_exit();
+ self->vm_fd = ioctl(self->kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL);
+ ASSERT_GE(self->vm_fd, 0);
+
+ kvm_device_attr_get(self->vm_fd, KVM_S390_VM_CPU_MODEL,
+ KVM_S390_VM_CPU_PROCESSOR, &info);
+ TH_LOG("create VM 0x%llx", info.cpuid);
+
+ self->vcpu_fd = ioctl(self->vm_fd, KVM_CREATE_VCPU, 0);
+ ASSERT_GE(self->vcpu_fd, 0);
+
+ self->kvm_run_size = ioctl(self->kvm_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
+ ASSERT_GE(self->kvm_run_size, sizeof(struct kvm_run))
+ TH_LOG(KVM_IOCTL_ERROR(KVM_GET_VCPU_MMAP_SIZE, self->kvm_run_size));
+ self->run = (struct kvm_run *)mmap(NULL, self->kvm_run_size,
+ PROT_READ | PROT_WRITE, MAP_SHARED, self->vcpu_fd, 0);
+ ASSERT_NE(self->run, MAP_FAILED);
+ /**
+ * For virtual cpus that have been created with S390 user controlled
+ * virtual machines, the resulting vcpu fd can be memory mapped at page
+ * offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of
+ * the virtual cpu's hardware control block.
+ */
+ self->sie_block = (struct kvm_s390_sie_block *)mmap(NULL, PAGE_SIZE,
+ PROT_READ | PROT_WRITE, MAP_SHARED,
+ self->vcpu_fd, KVM_S390_SIE_PAGE_OFFSET << PAGE_SHIFT);
+ ASSERT_NE(self->sie_block, MAP_FAILED);
+
+ TH_LOG("VM created %p %p", self->run, self->sie_block);
+
+ self->base_gpa = 0;
+ self->code_gpa = self->base_gpa + (3 * SZ_1M);
+
+ self->vm_mem = aligned_alloc(SZ_1M, VM_MEM_SIZE);
+ ASSERT_NE(NULL, self->vm_mem) TH_LOG("malloc failed %u", errno);
+ self->base_hva = (uintptr_t)self->vm_mem;
+ self->code_hva = self->base_hva - self->base_gpa + self->code_gpa;
+ struct kvm_s390_ucas_mapping map = {
+ .user_addr = self->base_hva,
+ .vcpu_addr = self->base_gpa,
+ .length = VM_MEM_SIZE,
+ };
+ TH_LOG("ucas map %p %p 0x%llx",
+ (void *)map.user_addr, (void *)map.vcpu_addr, map.length);
+ rc = ioctl(self->vcpu_fd, KVM_S390_UCAS_MAP, &map);
+ ASSERT_EQ(0, rc) TH_LOG("ucas map result %d not expected, %s",
+ rc, strerror(errno));
+
+ TH_LOG("page in %p", (void *)self->base_gpa);
+ rc = ioctl(self->vcpu_fd, KVM_S390_VCPU_FAULT, self->base_gpa);
+ ASSERT_EQ(0, rc) TH_LOG("vcpu fault (%p) result %d not expected, %s",
+ (void *)self->base_hva, rc, strerror(errno));
+
+ self->sie_block->cpuflags &= ~CPUSTAT_STOPPED;
+}
+
+FIXTURE_TEARDOWN(uc_kvm)
+{
+ munmap(self->sie_block, PAGE_SIZE);
+ munmap(self->run, self->kvm_run_size);
+ close(self->vcpu_fd);
+ close(self->vm_fd);
+ close(self->kvm_fd);
+ free(self->vm_mem);
+}
+
+TEST_F(uc_kvm, uc_sie_assertions)
+{
+ /* assert interception of Code 08 (Program Interruption) is set */
+ EXPECT_EQ(0, self->sie_block->ecb & ECB_SPECI);
+}
+
+TEST_F(uc_kvm, uc_attr_mem_limit)
+{
+ u64 limit;
+ struct kvm_device_attr attr = {
+ .group = KVM_S390_VM_MEM_CTRL,
+ .attr = KVM_S390_VM_MEM_LIMIT_SIZE,
+ .addr = (unsigned long)&limit,
+ };
+ int rc;
+
+ rc = ioctl(self->vm_fd, KVM_GET_DEVICE_ATTR, &attr);
+ EXPECT_EQ(0, rc);
+ EXPECT_EQ(~0UL, limit);
+
+ /* assert set not supported */
+ rc = ioctl(self->vm_fd, KVM_SET_DEVICE_ATTR, &attr);
+ EXPECT_EQ(-1, rc);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+TEST_F(uc_kvm, uc_no_dirty_log)
+{
+ struct kvm_dirty_log dlog;
+ int rc;
+
+ rc = ioctl(self->vm_fd, KVM_GET_DIRTY_LOG, &dlog);
+ EXPECT_EQ(-1, rc);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+/**
+ * Assert HPAGE CAP cannot be enabled on UCONTROL VM
+ */
+TEST(uc_cap_hpage)
+{
+ int rc, kvm_fd, vm_fd, vcpu_fd;
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_S390_HPAGE_1M,
+ };
+
+ require_ucontrol_admin();
+
+ kvm_fd = open_kvm_dev_path_or_exit();
+ vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, KVM_VM_S390_UCONTROL);
+ ASSERT_GE(vm_fd, 0);
+
+ /* assert hpages are not supported on ucontrol vm */
+ rc = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_HPAGE_1M);
+ EXPECT_EQ(0, rc);
+
+ /* Test that KVM_CAP_S390_HPAGE_1M can't be enabled for a ucontrol vm */
+	rc = ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+ EXPECT_EQ(-1, rc);
+ EXPECT_EQ(EINVAL, errno);
+
+ /* assert HPAGE CAP is rejected after vCPU creation */
+ vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
+ ASSERT_GE(vcpu_fd, 0);
+	rc = ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
+ EXPECT_EQ(-1, rc);
+ EXPECT_EQ(EBUSY, errno);
+
+ close(vcpu_fd);
+ close(vm_fd);
+ close(kvm_fd);
+}
+
+/*
+ * Verify a SIEIC exit; fail on interception codes not expected in the
+ * test cases.
+ */
+static bool uc_handle_sieic(FIXTURE_DATA(uc_kvm) * self)
+{
+ struct kvm_s390_sie_block *sie_block = self->sie_block;
+ struct kvm_run *run = self->run;
+
+ /* check SIE interception code */
+ pr_info("sieic: 0x%.2x 0x%.4x 0x%.4x\n",
+ run->s390_sieic.icptcode,
+ run->s390_sieic.ipa,
+ run->s390_sieic.ipb);
+ switch (run->s390_sieic.icptcode) {
+ case ICPT_INST:
+ /* end execution in caller on intercepted instruction */
+ pr_info("sie instruction interception\n");
+ return false;
+ case ICPT_OPEREXC:
+ /* operation exception */
+ TEST_FAIL("sie exception on %.4x%.8x", sie_block->ipa, sie_block->ipb);
+ default:
+ TEST_FAIL("UNEXPECTED SIEIC CODE %d", run->s390_sieic.icptcode);
+ }
+ return true;
+}
+
+/* verify VM state on exit */
+static bool uc_handle_exit(FIXTURE_DATA(uc_kvm) * self)
+{
+ struct kvm_run *run = self->run;
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_S390_SIEIC:
+ return uc_handle_sieic(self);
+ default:
+ pr_info("exit_reason %2d not handled\n", run->exit_reason);
+ }
+ return true;
+}
+
+/* run the VM until interrupted */
+static int uc_run_once(FIXTURE_DATA(uc_kvm) * self)
+{
+ int rc;
+
+ rc = ioctl(self->vcpu_fd, KVM_RUN, NULL);
+ print_run(self->run, self->sie_block);
+ print_regs(self->run);
+ pr_debug("run %d / %d %s\n", rc, errno, strerror(errno));
+ return rc;
+}
+
+static void uc_assert_diag44(FIXTURE_DATA(uc_kvm) * self)
+{
+ struct kvm_s390_sie_block *sie_block = self->sie_block;
+
+ /* assert vm was interrupted by diag 0x0044 */
+ TEST_ASSERT_EQ(KVM_EXIT_S390_SIEIC, self->run->exit_reason);
+ TEST_ASSERT_EQ(ICPT_INST, sie_block->icptcode);
+ TEST_ASSERT_EQ(0x8300, sie_block->ipa);
+ TEST_ASSERT_EQ(0x440000, sie_block->ipb);
+}
+
+TEST_F(uc_kvm, uc_gprs)
+{
+ struct kvm_sync_regs *sync_regs = &self->run->s.regs;
+ struct kvm_run *run = self->run;
+ struct kvm_regs regs = {};
+
+ /* Set registers to values that are different from the ones that we expect below */
+ for (int i = 0; i < 8; i++)
+ sync_regs->gprs[i] = 8;
+ run->kvm_dirty_regs |= KVM_SYNC_GPRS;
+
+ /* copy test_gprs_asm to code_hva / code_gpa */
+ TH_LOG("copy code %p to vm mapped memory %p / %p",
+ &test_gprs_asm, (void *)self->code_hva, (void *)self->code_gpa);
+ memcpy((void *)self->code_hva, &test_gprs_asm, PAGE_SIZE);
+
+ /* DAT disabled + 64 bit mode */
+ run->psw_mask = 0x0000000180000000ULL;
+ run->psw_addr = self->code_gpa;
+
+ /* run and expect interception of diag 44 */
+ ASSERT_EQ(0, uc_run_once(self));
+ ASSERT_EQ(false, uc_handle_exit(self));
+ uc_assert_diag44(self);
+
+ /* Retrieve and check guest register values */
+ ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs));
+ for (int i = 0; i < 8; i++) {
+ ASSERT_EQ(i, regs.gprs[i]);
+ ASSERT_EQ(i, sync_regs->gprs[i]);
+ }
+
+ /* run and expect interception of diag 44 again */
+ ASSERT_EQ(0, uc_run_once(self));
+ ASSERT_EQ(false, uc_handle_exit(self));
+ uc_assert_diag44(self);
+
+ /* check continued increment of register 0 value */
+ ASSERT_EQ(0, ioctl(self->vcpu_fd, KVM_GET_REGS, &regs));
+ ASSERT_EQ(1, regs.gprs[0]);
+ ASSERT_EQ(1, sync_regs->gprs[0]);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
index bb8002084f52..a8267628e9ed 100644
--- a/tools/testing/selftests/kvm/set_memory_region_test.c
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -175,7 +175,7 @@ static void guest_code_move_memory_region(void)
GUEST_DONE();
}
-static void test_move_memory_region(void)
+static void test_move_memory_region(bool disable_slot_zap_quirk)
{
pthread_t vcpu_thread;
struct kvm_vcpu *vcpu;
@@ -184,6 +184,9 @@ static void test_move_memory_region(void)
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_move_memory_region);
+ if (disable_slot_zap_quirk)
+ vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+
hva = addr_gpa2hva(vm, MEM_REGION_GPA);
/*
@@ -266,7 +269,7 @@ static void guest_code_delete_memory_region(void)
GUEST_ASSERT(0);
}
-static void test_delete_memory_region(void)
+static void test_delete_memory_region(bool disable_slot_zap_quirk)
{
pthread_t vcpu_thread;
struct kvm_vcpu *vcpu;
@@ -276,6 +279,9 @@ static void test_delete_memory_region(void)
vm = spawn_vm(&vcpu, &vcpu_thread, guest_code_delete_memory_region);
+ if (disable_slot_zap_quirk)
+ vm_enable_cap(vm, KVM_CAP_DISABLE_QUIRKS2, KVM_X86_QUIRK_SLOT_ZAP_ALL);
+
/* Delete the memory region, the guest should not die. */
vm_mem_region_delete(vm, MEM_REGION_SLOT);
wait_for_vcpu();
@@ -553,7 +559,10 @@ int main(int argc, char *argv[])
{
#ifdef __x86_64__
int i, loops;
+ int j, disable_slot_zap_quirk = 0;
+ if (kvm_check_cap(KVM_CAP_DISABLE_QUIRKS2) & KVM_X86_QUIRK_SLOT_ZAP_ALL)
+ disable_slot_zap_quirk = 1;
/*
* FIXME: the zero-memslot test fails on aarch64 and s390x because
* KVM_RUN fails with ENOEXEC or EFAULT.
@@ -579,13 +588,17 @@ int main(int argc, char *argv[])
else
loops = 10;
- pr_info("Testing MOVE of in-use region, %d loops\n", loops);
- for (i = 0; i < loops; i++)
- test_move_memory_region();
+ for (j = 0; j <= disable_slot_zap_quirk; j++) {
+ pr_info("Testing MOVE of in-use region, %d loops, slot zap quirk %s\n",
+ loops, j ? "disabled" : "enabled");
+ for (i = 0; i < loops; i++)
+ test_move_memory_region(!!j);
- pr_info("Testing DELETE of in-use region, %d loops\n", loops);
- for (i = 0; i < loops; i++)
- test_delete_memory_region();
+ pr_info("Testing DELETE of in-use region, %d loops, slot zap quirk %s\n",
+ loops, j ? "disabled" : "enabled");
+ for (i = 0; i < loops; i++)
+ test_delete_memory_region(!!j);
+ }
#endif
return 0;
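
Behind the selftest helpers used above, the quirk handshake is two plain ioctl()s: KVM_CHECK_EXTENSION on KVM_CAP_DISABLE_QUIRKS2 returns the mask of quirks KVM allows to be disabled, and KVM_ENABLE_CAP disables the chosen subset per VM. A minimal sketch, assuming vm_fd is an open VM file descriptor:

struct kvm_enable_cap cap = {
	.cap = KVM_CAP_DISABLE_QUIRKS2,
	.args = { KVM_X86_QUIRK_SLOT_ZAP_ALL },
};

/* Only try to disable the quirk if KVM reports it as disable-able. */
if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DISABLE_QUIRKS2) &
    KVM_X86_QUIRK_SLOT_ZAP_ALL)
	ioctl(vm_fd, KVM_ENABLE_CAP, &cap);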
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
index f6b295e0b2d2..76cc2df9238a 100644
--- a/tools/testing/selftests/kvm/x86_64/debug_regs.c
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -47,15 +47,18 @@ static void guest_code(void)
/*
* Single step test, covers 2 basic instructions and 2 emulated ones
*
- * Enable interrupts during the single stepping to see that
- * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ
+ * Enable interrupts during the single stepping to see that the pending
+ * interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ.
+ *
+ * Write MSR_IA32_TSC_DEADLINE to verify that KVM's fastpath handler
+ * exits to userspace due to single-step being enabled.
*/
asm volatile("ss_start: "
"sti\n\t"
"xor %%eax,%%eax\n\t"
"cpuid\n\t"
- "movl $0x1a0,%%ecx\n\t"
- "rdmsr\n\t"
+ "movl $" __stringify(MSR_IA32_TSC_DEADLINE) ", %%ecx\n\t"
+ "wrmsr\n\t"
"cli\n\t"
: : : "eax", "ebx", "ecx", "edx");
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
index e192720bfe14..74cf19661309 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_evmcs.c
@@ -242,7 +242,7 @@ int main(int argc, char *argv[])
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX));
TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE));
TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS));
- TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH));
+ TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH));
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
index b987a3d79715..0ddb63229bcb 100644
--- a/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_svm_test.c
@@ -157,7 +157,7 @@ int main(int argc, char *argv[])
int stage;
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM));
- TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_DIRECT_TLBFLUSH));
+ TEST_REQUIRE(kvm_hv_cpu_has(HV_X64_NESTED_DIRECT_FLUSH));
/* Create VM */
vm = vm_create_with_one_vcpu(&vcpu, guest_code);
diff --git a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
index 7c70c0da4fb7..2e9197eb1652 100644
--- a/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86_64/sev_smoke_test.c
@@ -160,6 +160,36 @@ static void test_sev(void *guest_code, uint64_t policy)
kvm_vm_free(vm);
}
+static void guest_shutdown_code(void)
+{
+ struct desc_ptr idt;
+
+ /* Clobber the IDT so that #UD is guaranteed to trigger SHUTDOWN. */
+ memset(&idt, 0, sizeof(idt));
+ __asm__ __volatile__("lidt %0" :: "m"(idt));
+
+ __asm__ __volatile__("ud2");
+}
+
+static void test_sev_es_shutdown(void)
+{
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+
+ uint32_t type = KVM_X86_SEV_ES_VM;
+
+ vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu);
+
+ vm_sev_launch(vm, SEV_POLICY_ES, NULL);
+
+ vcpu_run(vcpu);
+ TEST_ASSERT(vcpu->run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Wanted SHUTDOWN, got %s",
+ exit_reason_str(vcpu->run->exit_reason));
+
+ kvm_vm_free(vm);
+}
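
For context on why the lidt guarantees a SHUTDOWN exit: with the IDT limit zeroed, the #UD raised by ud2 cannot be delivered, escalates to #DF and then to a triple fault, which the CPU reports as SHUTDOWN. The descriptor-table pointer is roughly as follows (a sketch; the selftest headers carry the real definition):

struct desc_ptr {
	uint16_t size;		/* limit; 0 puts every vector out of bounds */
	uint64_t address;	/* IDT base, irrelevant once the limit is 0 */
} __attribute__((packed));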
+
int main(int argc, char *argv[])
{
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
@@ -171,6 +201,8 @@ int main(int argc, char *argv[])
test_sev(guest_sev_es_code, SEV_POLICY_ES | SEV_POLICY_NO_DBG);
test_sev(guest_sev_es_code, SEV_POLICY_ES);
+ test_sev_es_shutdown();
+
if (kvm_has_cap(KVM_CAP_XCRS) &&
(xgetbv(0) & XFEATURE_MASK_X87_AVX) == XFEATURE_MASK_X87_AVX) {
test_sync_vmsa(0);
diff --git a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
index 618cd2442390..88bcca188799 100644
--- a/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xapic_state_test.c
@@ -13,6 +13,7 @@
struct xapic_vcpu {
struct kvm_vcpu *vcpu;
bool is_x2apic;
+ bool has_xavic_errata;
};
static void xapic_guest_code(void)
@@ -31,6 +32,10 @@ static void xapic_guest_code(void)
}
}
+#define X2APIC_RSVD_BITS_MASK (GENMASK_ULL(31, 20) | \
+ GENMASK_ULL(17, 16) | \
+ GENMASK_ULL(13, 13))
+
static void x2apic_guest_code(void)
{
asm volatile("cli");
@@ -41,7 +46,12 @@ static void x2apic_guest_code(void)
uint64_t val = x2apic_read_reg(APIC_IRR) |
x2apic_read_reg(APIC_IRR + 0x10) << 32;
- x2apic_write_reg(APIC_ICR, val);
+ if (val & X2APIC_RSVD_BITS_MASK) {
+ x2apic_write_reg_fault(APIC_ICR, val);
+ } else {
+ x2apic_write_reg(APIC_ICR, val);
+ GUEST_ASSERT_EQ(x2apic_read_reg(APIC_ICR), val);
+ }
GUEST_SYNC(val);
} while (1);
}
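
For reference, the mask above expands to 0xfff32000; a compile-time check (illustrative, not part of the test) makes the bit positions explicit:

/* ICR bits 31:20, 17:16 and 13 are reserved in x2APIC mode. */
_Static_assert(X2APIC_RSVD_BITS_MASK ==
	       (0xfff00000ULL | 0x30000ULL | 0x2000ULL),
	       "X2APIC_RSVD_BITS_MASK == 0xfff32000");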
@@ -71,27 +81,28 @@ static void ____test_icr(struct xapic_vcpu *x, uint64_t val)
icr = (u64)(*((u32 *)&xapic.regs[APIC_ICR])) |
(u64)(*((u32 *)&xapic.regs[APIC_ICR2])) << 32;
if (!x->is_x2apic) {
- val &= (-1u | (0xffull << (32 + 24)));
- TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY);
- } else {
- TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY);
+ if (!x->has_xavic_errata)
+ val &= (-1u | (0xffull << (32 + 24)));
+ } else if (val & X2APIC_RSVD_BITS_MASK) {
+ return;
}
-}
-#define X2APIC_RSVED_BITS_MASK (GENMASK_ULL(31,20) | \
- GENMASK_ULL(17,16) | \
- GENMASK_ULL(13,13))
+ if (x->has_xavic_errata)
+ TEST_ASSERT_EQ(icr & ~APIC_ICR_BUSY, val & ~APIC_ICR_BUSY);
+ else
+ TEST_ASSERT_EQ(icr, val & ~APIC_ICR_BUSY);
+}
static void __test_icr(struct xapic_vcpu *x, uint64_t val)
{
- if (x->is_x2apic) {
- /* Hardware writing vICR register requires reserved bits 31:20,
- * 17:16 and 13 kept as zero to avoid #GP exception. Data value
- * written to vICR should mask out those bits above.
- */
- val &= ~X2APIC_RSVED_BITS_MASK;
- }
- ____test_icr(x, val | APIC_ICR_BUSY);
+ /*
+ * The BUSY bit is reserved on both AMD and Intel, but only AMD treats
+ * it as _must_ be zero. Intel simply ignores the bit. Don't test
+ * the BUSY bit for x2APIC, as there is no single correct behavior.
+ */
+ if (!x->is_x2apic)
+ ____test_icr(x, val | APIC_ICR_BUSY);
+
____test_icr(x, val & ~(u64)APIC_ICR_BUSY);
}
@@ -231,6 +242,15 @@ int main(int argc, char *argv[])
vm = vm_create_with_one_vcpu(&x.vcpu, xapic_guest_code);
x.is_x2apic = false;
+ /*
+ * AMD's AVIC implementation is buggy (fails to clear the ICR BUSY bit),
+ * and also diverges from KVM with respect to ICR2[23:0] (KVM and Intel
+ * drop writes, AMD does not). Account for the errata when checking
+ * that KVM reads back what was written.
+ */
+ x.has_xavic_errata = host_cpu_is_amd &&
+ get_kvm_amd_param_bool("avic");
+
vcpu_clear_cpuid_feature(x.vcpu, X86_FEATURE_X2APIC);
virt_pg_map(vm, APIC_DEFAULT_GPA, APIC_DEFAULT_GPA);
diff --git a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
index e149d0574961..2585087cdf5c 100644
--- a/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
+++ b/tools/testing/selftests/kvm/x86_64/xen_vmcall_test.c
@@ -10,6 +10,7 @@
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
+#include "hyperv.h"
#define HCALL_REGION_GPA 0xc0000000ULL
#define HCALL_REGION_SLOT 10
diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
index 27f6fdf11969..644915862af8 100644
--- a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
+++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
@@ -131,6 +131,8 @@ asm (
"_start:\n\t"
#ifdef __x86_64__
"mov %rsp,%rdi\n\t"
+ "and $-16,%rsp\n\t"
+ "sub $8,%rsp\n\t"
"jmp c_main"
#else
"push %esp\n\t"
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 1b90acb6e3fe..375d6285475e 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -40,27 +40,6 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
return 1;
}
-static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev, u32 last)
-{
- struct kvm_coalesced_mmio_ring *ring;
- unsigned avail;
-
- /* Are we able to batch it ? */
-
- /* last is the first free entry
- * check if we don't meet the first used entry
- * there is always one unused entry in the buffer
- */
- ring = dev->kvm->coalesced_mmio_ring;
- avail = (ring->first - last - 1) % KVM_COALESCED_MMIO_MAX;
- if (avail == 0) {
- /* full */
- return 0;
- }
-
- return 1;
-}
-
static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
struct kvm_io_device *this, gpa_t addr,
int len, const void *val)
@@ -74,9 +53,15 @@ static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
spin_lock(&dev->kvm->ring_lock);
+ /*
+ * last is the index of the entry to fill. Verify userspace hasn't
+ * set last to be out of range, and that there is room in the ring.
+ * Leave one entry free in the ring so that userspace can differentiate
+ * between an empty ring and a full ring.
+ */
insert = READ_ONCE(ring->last);
- if (!coalesced_mmio_has_room(dev, insert) ||
- insert >= KVM_COALESCED_MMIO_MAX) {
+ if (insert >= KVM_COALESCED_MMIO_MAX ||
+ (insert + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first)) {
spin_unlock(&dev->kvm->ring_lock);
return -EOPNOTSUPP;
}
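
Folding the room check into coalesced_mmio_write() keeps the classic ring-buffer convention of sacrificing one slot so that first == last always means empty, never full. A toy walk-through with a hypothetical 4-entry ring standing in for KVM_COALESCED_MMIO_MAX:

/* first = 0, last = 3: (3 + 1) % 4 == 0 == first -> full, -EOPNOTSUPP
 * first = 0, last = 0: empty; up to 3 entries may be queued
 * Usable capacity is therefore KVM_COALESCED_MMIO_MAX - 1. */
static bool toy_ring_full(uint32_t first, uint32_t last)
{
	return (last + 1) % 4 == first;	/* 4 stands in for the real max */
}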
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4f81366f8b61..05cbb2548d99 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -136,8 +136,8 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file)
#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
.open = kvm_no_compat_open
#endif
-static int hardware_enable_all(void);
-static void hardware_disable_all(void);
+static int kvm_enable_virtualization(void);
+static void kvm_disable_virtualization(void);
static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
@@ -1220,7 +1220,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
if (r)
goto out_err_no_arch_destroy_vm;
- r = hardware_enable_all();
+ r = kvm_enable_virtualization();
if (r)
goto out_err_no_disable;
@@ -1263,7 +1263,7 @@ out_no_coalesced_mmio:
mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
- hardware_disable_all();
+ kvm_disable_virtualization();
out_err_no_disable:
kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
@@ -1360,7 +1360,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
- hardware_disable_all();
+ kvm_disable_virtualization();
mmdrop(mm);
}
@@ -3270,6 +3270,9 @@ static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
int r;
unsigned long addr;
+ if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+ return -EFAULT;
+
addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
if (kvm_is_error_hva(addr))
return -EFAULT;
@@ -3343,6 +3346,9 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
int r;
unsigned long addr;
+ if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+ return -EFAULT;
+
addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
if (kvm_is_error_hva(addr))
return -EFAULT;
@@ -3373,6 +3379,9 @@ static int __kvm_write_guest_page(struct kvm *kvm,
int r;
unsigned long addr;
+ if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
+ return -EFAULT;
+
addr = gfn_to_hva_memslot(memslot, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
@@ -3576,7 +3585,7 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
int ret;
while ((seg = next_segment(len, offset)) != 0) {
- ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
+ ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
if (ret < 0)
return ret;
offset = 0;
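
A worked example of the bug fixed here: clearing 5000 bytes at offset 0 splits into segments of 4096 and 904 bytes, but the old code passed the full remaining len to kvm_write_guest_page() on each iteration, so the first write ran 904 bytes past the current guest page's host mapping (and would now trip the offset + len > PAGE_SIZE check added above). Passing seg keeps each write within one page:

/* len = 5000, offset = 0:
 *   iter 1: seg = min(PAGE_SIZE - 0, 5000) = 4096 -> write 4096 bytes
 *   iter 2: seg = min(PAGE_SIZE - 0,  904) =  904 -> write  904 bytes
 * The old code passed 'len' (5000, then 904) instead of 'seg'. */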
@@ -5566,137 +5575,67 @@ static struct miscdevice kvm_dev = {
};
#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
+static bool enable_virt_at_load = true;
+module_param(enable_virt_at_load, bool, 0444);
+
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);
-static DEFINE_PER_CPU(bool, hardware_enabled);
+static DEFINE_PER_CPU(bool, virtualization_enabled);
+static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;
-static int __hardware_enable_nolock(void)
+__weak void kvm_arch_enable_virtualization(void)
+{
+
+}
+
+__weak void kvm_arch_disable_virtualization(void)
+{
+
+}
+
+static int kvm_enable_virtualization_cpu(void)
{
- if (__this_cpu_read(hardware_enabled))
+ if (__this_cpu_read(virtualization_enabled))
return 0;
- if (kvm_arch_hardware_enable()) {
+ if (kvm_arch_enable_virtualization_cpu()) {
pr_info("kvm: enabling virtualization on CPU%d failed\n",
raw_smp_processor_id());
return -EIO;
}
- __this_cpu_write(hardware_enabled, true);
+ __this_cpu_write(virtualization_enabled, true);
return 0;
}
-static void hardware_enable_nolock(void *failed)
-{
- if (__hardware_enable_nolock())
- atomic_inc(failed);
-}
-
static int kvm_online_cpu(unsigned int cpu)
{
- int ret = 0;
-
/*
* Abort the CPU online process if hardware virtualization cannot
* be enabled. Otherwise running VMs would encounter unrecoverable
* errors when scheduled to this CPU.
*/
- mutex_lock(&kvm_lock);
- if (kvm_usage_count)
- ret = __hardware_enable_nolock();
- mutex_unlock(&kvm_lock);
- return ret;
+ return kvm_enable_virtualization_cpu();
}
-static void hardware_disable_nolock(void *junk)
+static void kvm_disable_virtualization_cpu(void *ign)
{
- /*
- * Note, hardware_disable_all_nolock() tells all online CPUs to disable
- * hardware, not just CPUs that successfully enabled hardware!
- */
- if (!__this_cpu_read(hardware_enabled))
+ if (!__this_cpu_read(virtualization_enabled))
return;
- kvm_arch_hardware_disable();
+ kvm_arch_disable_virtualization_cpu();
- __this_cpu_write(hardware_enabled, false);
+ __this_cpu_write(virtualization_enabled, false);
}
static int kvm_offline_cpu(unsigned int cpu)
{
- mutex_lock(&kvm_lock);
- if (kvm_usage_count)
- hardware_disable_nolock(NULL);
- mutex_unlock(&kvm_lock);
+ kvm_disable_virtualization_cpu(NULL);
return 0;
}
-static void hardware_disable_all_nolock(void)
-{
- BUG_ON(!kvm_usage_count);
-
- kvm_usage_count--;
- if (!kvm_usage_count)
- on_each_cpu(hardware_disable_nolock, NULL, 1);
-}
-
-static void hardware_disable_all(void)
-{
- cpus_read_lock();
- mutex_lock(&kvm_lock);
- hardware_disable_all_nolock();
- mutex_unlock(&kvm_lock);
- cpus_read_unlock();
-}
-
-static int hardware_enable_all(void)
-{
- atomic_t failed = ATOMIC_INIT(0);
- int r;
-
- /*
- * Do not enable hardware virtualization if the system is going down.
- * If userspace initiated a forced reboot, e.g. reboot -f, then it's
- * possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
- * after kvm_reboot() is called. Note, this relies on system_state
- * being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
- * hook instead of registering a dedicated reboot notifier (the latter
- * runs before system_state is updated).
- */
- if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
- system_state == SYSTEM_RESTART)
- return -EBUSY;
-
- /*
- * When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
- * is called, and so on_each_cpu() between them includes the CPU that
- * is being onlined. As a result, hardware_enable_nolock() may get
- * invoked before kvm_online_cpu(), which also enables hardware if the
- * usage count is non-zero. Disable CPU hotplug to avoid attempting to
- * enable hardware multiple times.
- */
- cpus_read_lock();
- mutex_lock(&kvm_lock);
-
- r = 0;
-
- kvm_usage_count++;
- if (kvm_usage_count == 1) {
- on_each_cpu(hardware_enable_nolock, &failed, 1);
-
- if (atomic_read(&failed)) {
- hardware_disable_all_nolock();
- r = -EBUSY;
- }
- }
-
- mutex_unlock(&kvm_lock);
- cpus_read_unlock();
-
- return r;
-}
-
static void kvm_shutdown(void)
{
/*
@@ -5712,34 +5651,32 @@ static void kvm_shutdown(void)
*/
pr_info("kvm: exiting hardware virtualization\n");
kvm_rebooting = true;
- on_each_cpu(hardware_disable_nolock, NULL, 1);
+ on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
}
static int kvm_suspend(void)
{
/*
* Secondary CPUs and CPU hotplug are disabled across the suspend/resume
- * callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
- * is stable. Assert that kvm_lock is not held to ensure the system
- * isn't suspended while KVM is enabling hardware. Hardware enabling
- * can be preempted, but the task cannot be frozen until it has dropped
- * all locks (userspace tasks are frozen via a fake signal).
+ * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
+ * count is stable. Assert that kvm_usage_lock is not held to ensure
+ * the system isn't suspended while KVM is enabling hardware. Hardware
+ * enabling can be preempted, but the task cannot be frozen until it has
+ * dropped all locks (userspace tasks are frozen via a fake signal).
*/
- lockdep_assert_not_held(&kvm_lock);
+ lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();
- if (kvm_usage_count)
- hardware_disable_nolock(NULL);
+ kvm_disable_virtualization_cpu(NULL);
return 0;
}
static void kvm_resume(void)
{
- lockdep_assert_not_held(&kvm_lock);
+ lockdep_assert_not_held(&kvm_usage_lock);
lockdep_assert_irqs_disabled();
- if (kvm_usage_count)
- WARN_ON_ONCE(__hardware_enable_nolock());
+ WARN_ON_ONCE(kvm_enable_virtualization_cpu());
}
static struct syscore_ops kvm_syscore_ops = {
@@ -5747,13 +5684,95 @@ static struct syscore_ops kvm_syscore_ops = {
.resume = kvm_resume,
.shutdown = kvm_shutdown,
};
+
+static int kvm_enable_virtualization(void)
+{
+ int r;
+
+ guard(mutex)(&kvm_usage_lock);
+
+ if (kvm_usage_count++)
+ return 0;
+
+ kvm_arch_enable_virtualization();
+
+ r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
+ kvm_online_cpu, kvm_offline_cpu);
+ if (r)
+ goto err_cpuhp;
+
+ register_syscore_ops(&kvm_syscore_ops);
+
+ /*
+ * Undo virtualization enabling and bail if the system is going down.
+ * If userspace initiated a forced reboot, e.g. reboot -f, then it's
+ * possible for an in-flight operation to enable virtualization after
+ * syscore_shutdown() is called, i.e. without kvm_shutdown() being
+ * invoked. Note, this relies on system_state being set _before_
+ * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
+ * or this CPU observes the impending shutdown, which is why KVM uses
+ * a syscore ops hook instead of registering a dedicated reboot
+ * notifier (the latter runs before system_state is updated).
+ */
+ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
+ system_state == SYSTEM_RESTART) {
+ r = -EBUSY;
+ goto err_rebooting;
+ }
+
+ return 0;
+
+err_rebooting:
+ unregister_syscore_ops(&kvm_syscore_ops);
+ cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+err_cpuhp:
+ kvm_arch_disable_virtualization();
+ --kvm_usage_count;
+ return r;
+}
+
+static void kvm_disable_virtualization(void)
+{
+ guard(mutex)(&kvm_usage_lock);
+
+ if (--kvm_usage_count)
+ return;
+
+ unregister_syscore_ops(&kvm_syscore_ops);
+ cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
+ kvm_arch_disable_virtualization();
+}
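
guard(mutex)(&kvm_usage_lock) is the scope-based locking helper from <linux/cleanup.h>: the mutex is acquired at the declaration and released automatically on every exit path, which is what lets both functions return early without explicit unlocks. Roughly equivalent, written out by hand:

mutex_lock(&kvm_usage_lock);
if (kvm_usage_count++) {
	mutex_unlock(&kvm_usage_lock);	/* implicit with guard() */
	return 0;
}
/* ... first-user enabling work ... */
mutex_unlock(&kvm_usage_lock);		/* likewise implicit */
return r;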
+
+static int kvm_init_virtualization(void)
+{
+ if (enable_virt_at_load)
+ return kvm_enable_virtualization();
+
+ return 0;
+}
+
+static void kvm_uninit_virtualization(void)
+{
+ if (enable_virt_at_load)
+ kvm_disable_virtualization();
+}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
-static int hardware_enable_all(void)
+static int kvm_enable_virtualization(void)
+{
+ return 0;
+}
+
+static int kvm_init_virtualization(void)
{
return 0;
}
-static void hardware_disable_all(void)
+static void kvm_disable_virtualization(void)
+{
+
+}
+
+static void kvm_uninit_virtualization(void)
{
}
@@ -6454,15 +6473,6 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
int r;
int cpu;
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
- r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
- kvm_online_cpu, kvm_offline_cpu);
- if (r)
- return r;
-
- register_syscore_ops(&kvm_syscore_ops);
-#endif
-
/* A kmem cache lets us meet the alignment requirements of fx_save. */
if (!vcpu_align)
vcpu_align = __alignof__(struct kvm_vcpu);
@@ -6473,10 +6483,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
offsetofend(struct kvm_vcpu, stats_id)
- offsetof(struct kvm_vcpu, arch),
NULL);
- if (!kvm_vcpu_cache) {
- r = -ENOMEM;
- goto err_vcpu_cache;
- }
+ if (!kvm_vcpu_cache)
+ return -ENOMEM;
for_each_possible_cpu(cpu) {
if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
@@ -6510,6 +6518,10 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
kvm_gmem_init(module);
+ r = kvm_init_virtualization();
+ if (r)
+ goto err_virt;
+
/*
* Registration _must_ be the very last thing done, as this exposes
* /dev/kvm to userspace, i.e. all infrastructure must be setup!
@@ -6523,6 +6535,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
return 0;
err_register:
+ kvm_uninit_virtualization();
+err_virt:
kvm_vfio_ops_exit();
err_vfio:
kvm_async_pf_deinit();
@@ -6533,11 +6547,6 @@ err_cpu_kick_mask:
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
-err_vcpu_cache:
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
- unregister_syscore_ops(&kvm_syscore_ops);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
@@ -6553,16 +6562,14 @@ void kvm_exit(void)
*/
misc_deregister(&kvm_dev);
+ kvm_uninit_virtualization();
+
debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
kvm_vfio_ops_exit();
kvm_async_pf_deinit();
-#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
- unregister_syscore_ops(&kvm_syscore_ops);
- cpuhp_remove_state_nocalls(CPUHP_AP_KVM_ONLINE);
-#endif
kvm_irqfd_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);