Merge tag 'amd-drm-next-6.2-2022-11-04' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.2-2022-11-04: amdgpu: - Add TMZ support for GC 11.0.1 - More IP version check conversions - Mode2 reset fixes for sienna cichlid - SMU 13.x fixes - RAS enablement on MP 13.x - Replace kmap with kmap_local_page() - Misc Clang warning fixes - SR-IOV fixes for GC 11.x - PCI AER fix - DCN 3.2.x commit sequence rework - SDMA 4.x doorbell fix - Expose additional new GC 11.x firmware versions - Misc code cleanups - S0i3 fixes - More DC FPU cleanup - Add more DC kerneldoc - Misc spelling and grammer fixes - DCN 3.1.x fixes - Plane modifier fix - MCA RAS enablement - Secure display locking fix - RAS TA rework - RAS EEPROM fixes - Fail suspend if eviction fails - Drop AMD specific DSC workarounds in favor of drm EDID quirks - SR-IOV suspend/resume fixes - Enable DCN support for ARM - Enable secure display on DCN 2.1 amdkfd: - Cache size fixes for GC 10.3.x - kfd_dev struct cleanup - GC11.x CWSR trap handler fix - Userptr fixes - Warning fixes radeon: - Replace kmap with kmap_local_page() UAPI: - Expose additional new GC 11.x firmware versions via the existing INFO query drm: - Add some new EDID DSC quirks Signed-off-by: Dave Airlie <airlied@redhat.com> # Conflicts: # drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221104205827.6008-1-alexander.deucher@amd.com
author: Dave Airlie <airlied@redhat.com> 2022-11-08 16:32:31 +1000
committer: Dave Airlie <airlied@redhat.com> 2022-11-08 16:32:31 +1000
commit: 49e8e6343df688d68b12c2af50791ca37520f0b7 (patch)
tree: 44b193d9161a2fa071e7b1ecb96014e8e73a240a /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
parent: 60ba8c5bd94e17ab4b024f5cecf8b48e2cf36412 (diff)
parent: fcf00f8d29f2fc6bf00531a1447be28b99073cc3 (diff)
1 files changed, 73 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..f76c19fc0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,59 @@
  */
 
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				    struct ras_err_data *err_data, uint64_t err_addr,
+				    uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			 "UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
@@ -112,23 +165,29 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
-	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+	int ret = AMDGPU_RAS_SUCCESS;
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		struct ras_err_data err_data = {0, 0, 0, NULL};
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
+		}
+	} else if (reset) {
+		/* MCA poison handler is only responsible for GPU reset,
+		 * let MCA notifier do page retirement.
+		 */
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
 	}
 
 	return ret;
author	Dave Airlie <airlied@redhat.com>	2022-11-08 16:32:31 +1000
committer	Dave Airlie <airlied@redhat.com>	2022-11-08 16:32:31 +1000
commit	49e8e6343df688d68b12c2af50791ca37520f0b7 (patch)
tree	44b193d9161a2fa071e7b1ecb96014e8e73a240a /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
parent	60ba8c5bd94e17ab4b024f5cecf8b48e2cf36412 (diff)
parent	fcf00f8d29f2fc6bf00531a1447be28b99073cc3 (diff)