diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 169 | 
1 file changed, 114 insertions(+), 55 deletions(-)
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index a90029ee9733..46264a4002f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -23,6 +23,120 @@  #include "amdgpu_ras.h" +static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, +		void *ras_error_status, +		struct amdgpu_iv_entry *entry, +		bool reset) +{ +	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	int ret = 0; + +	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); +	ret = smu_get_ecc_info(&adev->smu, (void *)&(con->umc_ecc)); +	if (ret == -EOPNOTSUPP) { +		if (adev->umc.ras_funcs && +		    adev->umc.ras_funcs->query_ras_error_count) +		    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status); + +		if (adev->umc.ras_funcs && +		    adev->umc.ras_funcs->query_ras_error_address && +		    adev->umc.max_ras_err_cnt_per_query) { +			err_data->err_addr = +				kcalloc(adev->umc.max_ras_err_cnt_per_query, +					sizeof(struct eeprom_table_record), GFP_KERNEL); + +			/* still call query_ras_error_address to clear error status +			 * even NOMEM error is encountered +			 */ +			if(!err_data->err_addr) +				dev_warn(adev->dev, "Failed to alloc memory for " +						"umc error address record!\n"); + +			/* umc query_ras_error_address is also responsible for clearing +			 * error status +			 */ +			adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status); +		} +	} else if (!ret) { +		if (adev->umc.ras_funcs && +		    adev->umc.ras_funcs->ecc_info_query_ras_error_count) +		    adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, ras_error_status); + +		if (adev->umc.ras_funcs && +		    adev->umc.ras_funcs->ecc_info_query_ras_error_address && +		    adev->umc.max_ras_err_cnt_per_query) { +			err_data->err_addr = +				kcalloc(adev->umc.max_ras_err_cnt_per_query, +					sizeof(struct eeprom_table_record), 
GFP_KERNEL); + +			/* still call query_ras_error_address to clear error status +			 * even NOMEM error is encountered +			 */ +			if(!err_data->err_addr) +				dev_warn(adev->dev, "Failed to alloc memory for " +						"umc error address record!\n"); + +			/* umc query_ras_error_address is also responsible for clearing +			 * error status +			 */ +			adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, ras_error_status); +		} +	} + +	/* only uncorrectable error needs gpu reset */ +	if (err_data->ue_count) { +		dev_info(adev->dev, "%ld uncorrectable hardware errors " +				"detected in UMC block\n", +				err_data->ue_count); + +		if ((amdgpu_bad_page_threshold != 0) && +			err_data->err_addr_cnt) { +			amdgpu_ras_add_bad_pages(adev, err_data->err_addr, +						err_data->err_addr_cnt); +			amdgpu_ras_save_bad_pages(adev); + +			if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) +				adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); +		} + +		if (reset) +			amdgpu_ras_reset_gpu(adev); +	} + +	kfree(err_data->err_addr); +	return AMDGPU_RAS_SUCCESS; +} + +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, +		void *ras_error_status, +		bool reset) +{ +	int ret; +	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; +	struct ras_common_if head = { +		.block = AMDGPU_RAS_BLOCK__UMC, +	}; +	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + +	ret = +		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset); + +	if (ret == AMDGPU_RAS_SUCCESS && obj) { +		obj->err_data.ue_count += err_data->ue_count; +		obj->err_data.ce_count += err_data->ce_count; +	} + +	return ret; +} + +static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, +		void *ras_error_status, +		struct amdgpu_iv_entry *entry) +{ +	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); +} +  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)  {  	int r; @@ -88,61 
+202,6 @@ void amdgpu_umc_ras_fini(struct amdgpu_device *adev)  	}  } -int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, -		void *ras_error_status, -		struct amdgpu_iv_entry *entry) -{ -	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; -	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); -	if (adev->umc.ras_funcs && -	    adev->umc.ras_funcs->query_ras_error_count) -	    adev->umc.ras_funcs->query_ras_error_count(adev, ras_error_status); - -	if (adev->umc.ras_funcs && -	    adev->umc.ras_funcs->query_ras_error_address && -	    adev->umc.max_ras_err_cnt_per_query) { -		err_data->err_addr = -			kcalloc(adev->umc.max_ras_err_cnt_per_query, -				sizeof(struct eeprom_table_record), GFP_KERNEL); - -		/* still call query_ras_error_address to clear error status -		 * even NOMEM error is encountered -		 */ -		if(!err_data->err_addr) -			dev_warn(adev->dev, "Failed to alloc memory for " -					"umc error address record!\n"); - -		/* umc query_ras_error_address is also responsible for clearing -		 * error status -		 */ -		adev->umc.ras_funcs->query_ras_error_address(adev, ras_error_status); -	} - -	/* only uncorrectable error needs gpu reset */ -	if (err_data->ue_count) { -		dev_info(adev->dev, "%ld uncorrectable hardware errors " -				"detected in UMC block\n", -				err_data->ue_count); - -		if ((amdgpu_bad_page_threshold != 0) && -			err_data->err_addr_cnt) { -			amdgpu_ras_add_bad_pages(adev, err_data->err_addr, -						err_data->err_addr_cnt); -			amdgpu_ras_save_bad_pages(adev); - -			if (adev->smu.ppt_funcs && adev->smu.ppt_funcs->send_hbm_bad_pages_num) -				adev->smu.ppt_funcs->send_hbm_bad_pages_num(&adev->smu, con->eeprom_control.ras_num_recs); -		} - -		amdgpu_ras_reset_gpu(adev); -	} - -	kfree(err_data->err_addr); -	return AMDGPU_RAS_SUCCESS; -} -  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,  		struct amdgpu_irq_src *source,  		struct amdgpu_iv_entry *entry) |