diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 | 
1 files changed, 20 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ff5361f5c2d4..2dad7aa9a03b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1811,7 +1811,8 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 		amdgpu_ras_query_error_status(adev, &info);
 
 		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
+		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
 			if (amdgpu_ras_reset_error_status(adev, info.head.block))
 				dev_warn(adev->dev, "Failed to reset error counter and error status");
 		}
@@ -1949,6 +1950,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 		reset_context.method = AMD_RESET_METHOD_NONE;
 		reset_context.reset_req_dev = adev;
 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
 
 		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
 	}
@@ -2718,7 +2720,8 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 
 	/* Need disable ras on all IPs here before ip [hw/sw]fini */
-	amdgpu_ras_disable_all_features(adev, 0);
+	if (con->features)
+		amdgpu_ras_disable_all_features(adev, 0);
 	amdgpu_ras_recovery_fini(adev);
 	return 0;
 }
@@ -2831,11 +2834,8 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	struct mce *m = (struct mce *)data;
 	struct amdgpu_device *adev = NULL;
 	uint32_t gpu_id = 0;
-	uint32_t umc_inst = 0;
-	uint32_t ch_inst, channel_index = 0;
+	uint32_t umc_inst = 0, ch_inst = 0;
 	struct ras_err_data err_data = {0, 0, 0, NULL};
-	struct eeprom_table_record err_rec;
-	uint64_t retired_page;
 
 	/*
 	 * If the error was generated in UMC_V2, which belongs to GPU UMCs,
@@ -2874,21 +2874,22 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
 			     umc_inst, ch_inst);
 
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in mca notifier!\n");
+		return NOTIFY_DONE;
+	}
+
 	/*
 	 * Translate UMC channel address to Physical address
 	 */
-	channel_index =
-		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
-					  + ch_inst];
-
-	retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
-			ADDR_OF_256B_BLOCK(channel_index) |
-			OFFSET_IN_256B_BLOCK(m->addr);
-
-	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
-	err_data.err_addr = &err_rec;
-	amdgpu_umc_fill_error_record(&err_data, m->addr,
-			retired_page, channel_index, umc_inst);
+	if (adev->umc.ras &&
+	    adev->umc.ras->convert_ras_error_address)
+		adev->umc.ras->convert_ras_error_address(adev,
+			&err_data, m->addr, ch_inst, umc_inst);
 
 	if (amdgpu_bad_page_threshold != 0) {
 		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -2896,6 +2897,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 		amdgpu_ras_save_bad_pages(adev);
 	}
 
+	kfree(err_data.err_addr);
 	return NOTIFY_OK;
 }
 