aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2024-04-17 15:48:59 +1000
committerDave Airlie <airlied@redhat.com>2024-04-17 15:48:59 +1000
commit34633158b8eb8fca145c9a73f8fe4f98c7275b06 (patch)
treea8e0e2d55dff19f68a1c6842142255e9deaf2d7d /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent6e1f415e7129f7cd4c2394af83b35cdcdd40baf7 (diff)
parentab956ed95b8bc4a65c913d7057075866d5fc3724 (diff)
Merge tag 'amd-drm-next-6.10-2024-04-13' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.10-2024-04-13: amdgpu: - HDCP fixes - ODM fixes - RAS fixes - Devcoredump improvements - Misc code cleanups - Expose VCN activity via sysfs - SMY 13.0.x updates - Enable fast updates on DCN 3.1.4 - Add dclk and vclk reporting on additional devices - Add ACA RAS infrastructure - Implement TLB flush fence - EEPROM handling fixes - SMUIO 14.0.2 support - SMU 14.0.1 Updates - Sync page table freeing with TLB flushes - DML2 refactor - DC debug improvements - SR-IOV fixes - Suspend and Resume fixes - DCN 3.5.x Updates - Z8 fixes - UMSCH fixes - GPU reset fixes - HDP fix for second GFX pipe on GC 10.x - Enable secondary GFX pipe on GC 10.3 - Refactor and clean up BACO/BOCO/BAMACO handling - VCN partitioning fix - DC DWB fixes - VSC SDP fixes - DCN 3.1.6 fix - GC 11.5 fixes - Remove invalid TTM resource start check - DCN 1.0 fixes amdkfd: - MQD handling cleanup - Preemption handling fixes for XCDs - TLB flush fix for GC 9.4.2 - Properly clean up workqueue during module unload - Fix memory leak process create failure - Range check CP bad op exception targets to avoid reporting invalid exceptions to userspace radeon: - Misc code cleanups From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20240413213708.3427038-1-alexander.deucher@amd.com Signed-off-by: Dave Airlie <airlied@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c82
1 files changed, 63 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index aa16d51dd842..170da4551ed5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -74,6 +74,7 @@
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
#include "amdgpu_virt.h"
+#include "amdgpu_dev_coredump.h"
#include <linux/suspend.h>
#include <drm/task_barrier.h>
@@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {
"LAST",
};
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+
/**
* DOC: pcie_replay_count
*
@@ -335,10 +338,12 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
*
* @dev: drm_device pointer
*
- * Returns true if the device supporte BACO,
- * otherwise return false.
+ * Return:
+ * 1 if the device supporte BACO;
+ * 3 if the device support MACO (only works if BACO is supported)
+ * otherwise return 0.
*/
-bool amdgpu_device_supports_baco(struct drm_device *dev)
+int amdgpu_device_supports_baco(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
@@ -4069,6 +4074,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
+ if (amdgpu_sriov_vf(adev) &&
+ amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
+ /* VF MMIO access (except mailbox range) from CPU
+ * will be blocked during sriov runtime
+ */
+ adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
+
amdgpu_gmc_noretry_set(adev);
/* Need to get xgmi info early to decide the reset behavior*/
if (adev->gmc.xgmi.supported) {
@@ -4135,18 +4147,22 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->ip_blocks[i].status.hw = true;
}
}
+ } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
+ !amdgpu_device_has_display_hardware(adev)) {
+ r = psp_gpu_reset(adev);
} else {
- tmp = amdgpu_reset_method;
- /* It should do a default reset when loading or reloading the driver,
- * regardless of the module parameter reset_method.
- */
- amdgpu_reset_method = AMD_RESET_METHOD_NONE;
- r = amdgpu_asic_reset(adev);
- amdgpu_reset_method = tmp;
- if (r) {
- dev_err(adev->dev, "asic reset on init failed\n");
- goto failed;
- }
+ tmp = amdgpu_reset_method;
+ /* It should do a default reset when loading or reloading the driver,
+ * regardless of the module parameter reset_method.
+ */
+ amdgpu_reset_method = AMD_RESET_METHOD_NONE;
+ r = amdgpu_asic_reset(adev);
+ amdgpu_reset_method = tmp;
+ }
+
+ if (r) {
+ dev_err(adev->dev, "asic reset on init failed\n");
+ goto failed;
}
}
@@ -4970,12 +4986,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
amdgpu_amdkfd_pre_reset(adev);
+ amdgpu_device_stop_pending_resets(adev);
+
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
r = amdgpu_virt_reset_gpu(adev);
if (r)
return r;
+ amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev);
/* some sw clean up VF needs to do before recover */
@@ -5534,6 +5553,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
}
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+ struct amdgpu_device *tmp_adev;
+ int ret = 0;
+ u32 status;
+
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+ if (PCI_POSSIBLE_ERROR(status)) {
+ dev_err(tmp_adev->dev, "device lost from bus!");
+ ret = -ENODEV;
+ }
+ }
+
+ return ret;
+}
+
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -5605,6 +5641,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(device_list_handle);
+ if (r)
+ goto end_reset;
+ }
+
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
@@ -5687,11 +5729,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
tmp_adev->asic_reset_res = r;
}
- /*
- * Drop all pending non scheduler resets. Scheduler resets
- * were already dropped during drm_sched_stop
- */
- amdgpu_device_stop_pending_resets(tmp_adev);
+ if (!amdgpu_sriov_vf(tmp_adev))
+ /*
+ * Drop all pending non scheduler resets. Scheduler resets
+ * were already dropped during drm_sched_stop
+ */
+ amdgpu_device_stop_pending_resets(tmp_adev);
}
/* Actual ASIC resets if needed.*/
@@ -5770,6 +5813,7 @@ skip_sched_resume:
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);