diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 82 | 
1 files changed, 60 insertions, 22 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd..8f47c14ecbc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -867,9 +867,9 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 
 /* feature ctl end */
 
-void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
-				       struct ras_common_if *ras_block,
-				       struct ras_err_data  *err_data)
+static void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
+					      struct ras_common_if *ras_block,
+					      struct ras_err_data  *err_data)
 {
 	switch (ras_block->sub_block_index) {
 	case AMDGPU_RAS_MCA_BLOCK__MP0:
@@ -892,6 +892,38 @@ void amdgpu_ras_mca_query_error_status(struct amdgpu_device *adev,
 	}
 }
 
+static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	int ret = 0;
+
+	/*
+	 * choosing right query method according to
+	 * whether smu support query error information
+	 */
+	ret = smu_get_ecc_info(&adev->smu, (void *)&(ras->umc_ecc));
+	if (ret == -EOPNOTSUPP) {
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->query_ras_error_count)
+			adev->umc.ras_funcs->query_ras_error_count(adev, err_data);
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
+		if (adev->umc.ras_funcs &&
+		    adev->umc.ras_funcs->query_ras_error_address)
+			adev->umc.ras_funcs->query_ras_error_address(adev, err_data);
+	} else if (!ret) {
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->ecc_info_query_ras_error_count)
+			adev->umc.ras_funcs->ecc_info_query_ras_error_count(adev, err_data);
+
+		if (adev->umc.ras_funcs &&
+			adev->umc.ras_funcs->ecc_info_query_ras_error_address)
+			adev->umc.ras_funcs->ecc_info_query_ras_error_address(adev, err_data);
+	}
+}
+
 /* query/inject/cure begin */
 int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 				  struct ras_query_if *info)
@@ -905,15 +937,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 
 	switch (info->head.block) {
 	case AMDGPU_RAS_BLOCK__UMC:
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_count)
-			adev->umc.ras_funcs->query_ras_error_count(adev, &err_data);
-		/* umc query_ras_error_address is also responsible for clearing
-		 * error status
-		 */
-		if (adev->umc.ras_funcs &&
-		    adev->umc.ras_funcs->query_ras_error_address)
-			adev->umc.ras_funcs->query_ras_error_address(adev, &err_data);
+		amdgpu_ras_get_ecc_info(adev, &err_data);
 		break;
 	case AMDGPU_RAS_BLOCK__SDMA:
 		if (adev->sdma.funcs->query_ras_error_count) {
@@ -1137,9 +1161,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 
 /**
  * amdgpu_ras_query_error_count -- Get error counts of all IPs
- * adev: pointer to AMD GPU device
- * ce_count: pointer to an integer to be set to the count of correctible errors.
- * ue_count: pointer to an integer to be set to the count of uncorrectible
+ * @adev: pointer to AMD GPU device
+ * @ce_count: pointer to an integer to be set to the count of correctible errors.
+ * @ue_count: pointer to an integer to be set to the count of uncorrectible
  * errors.
  *
  * If set, @ce_count or @ue_count, count and return the corresponding
@@ -1568,6 +1592,7 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 				/* Let IP handle its data, maybe we need get the output
 				 * from the callback to udpate the error type/count, etc
 				 */
+				memset(&err_data, 0, sizeof(err_data));
 				ret = data->cb(obj->adev, &err_data, &entry);
 				/* ue will trigger an interrupt, and in that case
 				 * we need do a reset to recovery the whole system.
@@ -1723,6 +1748,16 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 		if (info.head.block == AMDGPU_RAS_BLOCK__PCIE_BIF)
 			continue;
 
+		/*
+		 * this is a workaround for aldebaran, skip send msg to
+		 * smu to get ecc_info table due to smu handle get ecc
+		 * info table failed temporarily.
+		 * should be removed until smu fix handle ecc_info table.
+		 */
+		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) &&
+			(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)))
+			continue;
+
 		amdgpu_ras_query_error_status(adev, &info);
 	}
 }
@@ -1804,8 +1839,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
-		status = amdgpu_vram_mgr_query_page_status(
-				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr,
 				data->bps[i].retired_page);
 		if (status == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
@@ -1906,8 +1940,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-		amdgpu_vram_mgr_reserve_range(
-			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
 			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
 			AMDGPU_GPU_PAGE_SIZE);
 
@@ -1935,9 +1968,11 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	if (!con || !con->eh_data)
 		return 0;
 
+	mutex_lock(&con->recovery_lock);
 	control = &con->eeprom_control;
 	data = con->eh_data;
 	save_count = data->count - control->ras_num_recs;
+	mutex_unlock(&con->recovery_lock);
 	/* only new entries are saved */
 	if (save_count > 0) {
 		if (amdgpu_ras_eeprom_append(control,
@@ -2336,7 +2371,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	}
 
 	/* Init poison supported flag, the default value is false */
-	if (adev->df.funcs &&
+	if (adev->gmc.xgmi.connected_to_cpu) {
+		/* enabled by default when GPU is connected to CPU */
+		con->poison_supported = true;
+	}
+	else if (adev->df.funcs &&
 	    adev->df.funcs->query_ras_poison_mode &&
 	    adev->umc.ras_funcs &&
 	    adev->umc.ras_funcs->query_ras_poison_mode) {
@@ -2477,7 +2516,6 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 	amdgpu_ras_sysfs_remove(adev, ras_block);
 	if (ih_info->cb)
 		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
-	amdgpu_ras_feature_enable(adev, ras_block, 0);
 }
 
 /* do some init work after IP late init as dependence.
@@ -2647,7 +2685,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
 	 * and error occurred in DramECC (Extended error code = 0) then only
 	 * process the error, else bail out.
 	 */
-	if (!m || !((smca_get_bank_type(m->bank) == SMCA_UMC_V2) &&
+	if (!m || !((smca_get_bank_type(m->extcpu, m->bank) == SMCA_UMC_V2) &&
 		    (XEC(m->status, 0x3f) == 0x0)))
 		return NOTIFY_DONE;
 