Merge tag 'drm-next-2020-12-11' of git://anongit.freedesktop.org/drm/drm

Pull drm updates from Dave Airlie: "Not a huge amount of big things here, AMD has support for a few new HW variants (vangogh, green sardine, dimgrey cavefish), Intel has some more DG1 enablement. We have a few big reworks of the TTM layers and interfaces, GEM and atomic internal API reworks cross tree. fbdev is marked orphaned in here as well to reflect the current reality. core: - documentation updates - deprecate DRM_FORMAT_MOD_NONE - atomic crtc enable/disable rework - GEM convert drivers to gem object functions - remove SCATTER_LIST_MAX_SEGMENT sched: - avoid infinite waits ttm: - remove AGP support - don't modify caching for swapout - ttm pinning rework - major TTM reworks - new backend allocator - multihop support vram-helper: - top down BO placement fix - TTM changes - GEM object support displayport: - DP 2.0 DPCD prep work - DP MST extended DPCD caps fbdev: - mark as orphaned amdgpu: - Initial Vangogh support - Green Sardine support - Dimgrey Cavefish support - SG display support for renoir - SMU7 improvements - gfx9+ modiifier support - CI BACO fixes radeon: - expose voltage via hwmon on SUMO amdkfd: - fix unique id handling i915: - more DG1 enablement - bigjoiner support - integer scaling filter support - async flip support - ICL+ DSI command mode - Improve display shutdown - Display refactoring - eLLC machine fbdev loading fix - dma scatterlist fixes - TGL hang fixes - eLLC display buffer caching on SKL+ - MOCS PTE seeting for gen9+ msm: - Shutdown hook - GPU cooling device support - DSI 7nm and 10nm phy/pll updates - sm8150/sm2850 DPU support - GEM locking re-work - LLCC system cache support aspeed: - sysfs output config support ast: - LUT fix - new display mode gma500: - remove 2d framebuffer accel panfrost: - move gpu reset to a worker exynos: - new HDMI mode support mediatek: - MT8167 support - yaml bindings - MIPI DSI phy code moved etnaviv: - new perf counter - more lockdep annotation hibmc: - i2c DDC support ingenic: - pixel clock reset fix - reserved memory support - allow both DMA channels at once - different pixel format support - 30/24/8-bit palette modes tilcdc: - don't keep vblank irq enabled vc4: - new maintainer added - DSI registration fix virtio: - blob resource support - host visible and cross-device support - uuid api support" * tag 'drm-next-2020-12-11' of git://anongit.freedesktop.org/drm/drm: (1754 commits) drm/amdgpu: Initialise drm_gem_object_funcs for imported BOs drm/amdgpu: fix size calculation with stolen vga memory drm/amdgpu: remove amdgpu_ttm_late_init and amdgpu_bo_late_init drm/amdgpu: free the pre-OS console framebuffer after the first modeset drm/amdgpu: enable runtime pm using BACO on CI dGPUs drm/amdgpu/cik: enable BACO reset on Bonaire drm/amd/pm: update smu10.h WORKLOAD_PPLIB setting for raven drm/amd/pm: remove one unsupported smu function for vangogh drm/amd/display: setup system context for APUs drm/amd/display: add S/G support for Vangogh drm/amdkfd: Fix leak in dmabuf import drm/amdgpu: use AMDGPU_NUM_VMID when possible drm/amdgpu: fix sdma instance fw version and feature version init drm/amd/pm: update driver if version for dimgrey_cavefish drm/amd/display: 3.2.115 drm/amd/display: [FW Promotion] Release 0.0.45 drm/amd/display: Revert DCN2.1 dram_clock_change_latency update drm/amd/display: Enable gpu_vm_support for dcn3.01 drm/amd/display: Fixed the audio noise during mode switching with HDCP mode on drm/amd/display: Add wm table for Renoir ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2020-12-14 11:07:56 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2020-12-14 11:07:56 -0800
commit: 1d36dffa5d887715dacca0f717f4519b7be5e498 (patch)
tree: a68f7c00dbb3036a67806ed6c6b8cc61c3cff60d /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: 2c85ebc57b3e1817b6ce1a6b703928e113a90442 (diff)
parent: b10733527bfd864605c33ab2e9a886eec317ec39 (diff)
1 files changed, 48 insertions, 130 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 82cd8e55595a..c136bd449744 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -80,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
 
@@ -516,9 +518,9 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 /* obj end */
 
 static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
-				  const char* 		invoke_type,
-				  const char* 		block_name,
-				  enum ta_ras_status 	ret)
+					 const char* invoke_type,
+					 const char* block_name,
+					 enum ta_ras_status ret)
 {
 	switch (ret) {
 	case TA_RAS_STATUS__SUCCESS:
@@ -607,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	if (!con)
 		return -EINVAL;
 
-        info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+	info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
@@ -903,13 +905,6 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 	return ret;
 }
 
-int amdgpu_ras_error_cure(struct amdgpu_device *adev,
-		struct ras_cure_if *info)
-{
-	/* psp fw has no cure interface for now. */
-	return 0;
-}
-
 /* get the total error counts on all IPs */
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		bool is_ce)
@@ -953,7 +948,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
 	default:
 		return "F";
-	};
+	}
 }
 
 /**
@@ -1474,8 +1469,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 }
 
 /* Parse RdRspStatus and WrRspStatus */
-void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
-		struct ras_query_if *info)
+static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+					  struct ras_query_if *info)
 {
 	/*
 	 * Only two block need to query read/write
@@ -1548,10 +1543,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
-
-		if (data->last_reserved <= i)
+		ret = amdgpu_vram_mgr_query_page_status(
+				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+				data->bps[i].retired_page);
+		if (ret == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-		else if (data->bps_bo[i] == NULL)
+		else if (ret == -ENOENT)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
 	}
 
@@ -1603,12 +1600,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	unsigned int new_space = old_space + pages;
 	unsigned int align_space = ALIGN(new_space, 512);
 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-	struct amdgpu_bo **bps_bo =
-			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
 
-	if (!bps || !bps_bo) {
+	if (!bps) {
 		kfree(bps);
-		kfree(bps_bo);
 		return -ENOMEM;
 	}
 
@@ -1617,14 +1611,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
-	if (data->bps_bo) {
-		memcpy(bps_bo, data->bps_bo,
-				data->count * sizeof(*data->bps_bo));
-		kfree(data->bps_bo);
-	}
 
 	data->bps = bps;
-	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
@@ -1636,6 +1624,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
 	int ret = 0;
+	uint32_t i;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
 		return 0;
@@ -1645,16 +1634,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	if (!data)
 		goto out;
 
-	if (data->space_left <= pages)
-		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
+	for (i = 0; i < pages; i++) {
+		if (amdgpu_ras_check_bad_page_unlock(con,
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+			continue;
+
+		if (!data->space_left &&
+			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
-	data->count += pages;
-	data->space_left -= pages;
+		amdgpu_vram_mgr_reserve_range(
+			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+			AMDGPU_GPU_PAGE_SIZE);
 
+		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
+		data->count++;
+		data->space_left--;
+	}
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1665,7 +1664,7 @@ out:
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
  */
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
@@ -1727,6 +1726,20 @@ out:
 	return ret;
 }
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+				uint64_t addr)
+{
+	struct ras_err_handler_data *data = con->eh_data;
+	int i;
+
+	addr >>= AMDGPU_GPU_PAGE_SHIFT;
+	for (i = 0; i < data->count; i++)
+		if (addr == data->bps[i].retired_page)
+			return true;
+
+	return false;
+}
+
 /*
  * check if an address belongs to bad page
  *
@@ -1736,26 +1749,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	int i;
 	bool ret = false;
 
 	if (!con || !con->eh_data)
 		return ret;
 
 	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-
-	addr >>= AMDGPU_GPU_PAGE_SHIFT;
-	for (i = 0; i < data->count; i++)
-		if (addr == data->bps[i].retired_page) {
-			ret = true;
-			goto out;
-		}
-
-out:
+	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
 	mutex_unlock(&con->recovery_lock);
 	return ret;
 }
@@ -1801,80 +1801,6 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
 	}
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	uint64_t bp;
-	struct amdgpu_bo *bo = NULL;
-	int i, ret = 0;
-
-	/* Not reserve bad page when amdgpu_bad_page_threshold == 0. */
-	if (!con || !con->eh_data || (amdgpu_bad_page_threshold == 0))
-		return 0;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-	/* reserve vram at driver post stage. */
-	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].retired_page;
-
-		/* There are two cases of reserve error should be ignored:
-		 * 1) a ras bad page has been allocated (used by someone);
-		 * 2) a ras bad page has been reserved (duplicate error injection
-		 *    for one page);
-		 */
-		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-					       AMDGPU_GPU_PAGE_SIZE,
-					       AMDGPU_GEM_DOMAIN_VRAM,
-					       &bo, NULL))
-			dev_warn(adev->dev, "RAS WARN: reserve vram for "
-					"retired page %llx fail\n", bp);
-
-		data->bps_bo[i] = bo;
-		data->last_reserved = i + 1;
-		bo = NULL;
-	}
-
-	/* continue to save bad pages to eeprom even reesrve_vram fails */
-	ret = amdgpu_ras_save_bad_pages(adev);
-out:
-	mutex_unlock(&con->recovery_lock);
-	return ret;
-}
-
-/* called when driver unload */
-static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
-{
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	struct amdgpu_bo *bo;
-	int i;
-
-	if (!con || !con->eh_data)
-		return 0;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-
-	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps_bo[i];
-
-		amdgpu_bo_free_kernel(&bo, NULL, NULL);
-
-		data->bps_bo[i] = bo;
-		data->last_reserved = i;
-	}
-out:
-	mutex_unlock(&con->recovery_lock);
-	return 0;
-}
-
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -1914,18 +1840,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
 			goto free;
-		ret = amdgpu_ras_reserve_bad_pages(adev);
-		if (ret)
-			goto release;
 	}
 
 	return 0;
 
-release:
-	amdgpu_ras_release_bad_pages(adev);
 free:
 	kfree((*data)->bps);
-	kfree((*data)->bps_bo);
 	kfree(*data);
 	con->eh_data = NULL;
 out:
@@ -1953,12 +1873,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 		return 0;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
-	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -2153,7 +2071,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 
 	amdgpu_ras_sysfs_remove(adev, ras_block);
 	if (ih_info->cb)
-                amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
 }
author	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-14 11:07:56 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-12-14 11:07:56 -0800
commit	1d36dffa5d887715dacca0f717f4519b7be5e498 (patch)
tree	a68f7c00dbb3036a67806ed6c6b8cc61c3cff60d /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent	2c85ebc57b3e1817b6ce1a6b703928e113a90442 (diff)
parent	b10733527bfd864605c33ab2e9a886eec317ec39 (diff)