diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 68 | 
1 files changed, 32 insertions, 36 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2dad7aa9a03b..ad490c1e2f57 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1267,7 +1267,7 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,  	struct amdgpu_ras *con =  		container_of(attr, struct amdgpu_ras, features_attr); -	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features); +	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);  }  static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) @@ -1561,7 +1561,6 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  {  	bool poison_stat = false;  	struct amdgpu_device *adev = obj->adev; -	struct ras_err_data err_data = {0, 0, 0, NULL};  	struct amdgpu_ras_block_object *block_obj =  		amdgpu_ras_get_ras_block(adev, obj->head.block, 0); @@ -1584,7 +1583,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  	}  	if (!adev->gmc.xgmi.connected_to_cpu) -		amdgpu_umc_poison_handler(adev, &err_data, false); +		amdgpu_umc_poison_handler(adev, false);  	if (block_obj->hw_ops->handle_poison_consumption)  		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -1949,8 +1948,12 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  		reset_context.method = AMD_RESET_METHOD_NONE;  		reset_context.reset_req_dev = adev; -		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); -		clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags); + +		/* Perform full reset in fatal error mode */ +		if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) +			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); +		else +			clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);  		amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);  	} @@ -2268,6 +2271,25 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)  static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  { +	if (amdgpu_sriov_vf(adev)) { +		switch (adev->ip_versions[MP0_HWIP][0]) { +		case IP_VERSION(13, 0, 2): +			return true; +		default: +			return false; +		} +	} + +	if (adev->asic_type == CHIP_IP_DISCOVERY) { +		switch (adev->ip_versions[MP0_HWIP][0]) { +		case IP_VERSION(13, 0, 0): +		case IP_VERSION(13, 0, 10): +			return true; +		default: +			return false; +		} +	} +  	return adev->asic_type == CHIP_VEGA10 ||  		adev->asic_type == CHIP_VEGA20 ||  		adev->asic_type == CHIP_ARCTURUS || @@ -2311,11 +2333,6 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)  	    !amdgpu_ras_asic_supported(adev))  		return; -	/* If driver run on sriov guest side, only enable ras for aldebaran */ -	if (amdgpu_sriov_vf(adev) && -		adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2)) -		return; -  	if (!adev->gmc.xgmi.connected_to_cpu) {  		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {  			dev_info(adev->dev, "MEM ECC is active.\n"); @@ -2331,7 +2348,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)  				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |  							    1 << AMDGPU_RAS_BLOCK__DF); -				if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0)) +				if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || +				    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0))  					adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |  							1 << AMDGPU_RAS_BLOCK__JPEG);  				else @@ -2835,7 +2853,6 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,  	struct amdgpu_device *adev = NULL;  	uint32_t gpu_id = 0;  	uint32_t umc_inst = 0, ch_inst = 0; -	struct ras_err_data err_data = {0, 0, 0, NULL};  	/*  	 * If the error was generated in UMC_V2, which belongs to GPU UMCs, @@ -2874,31 +2891,10 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,  	dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",  			     umc_inst, ch_inst); -	err_data.err_addr = -		kcalloc(adev->umc.max_ras_err_cnt_per_query, -			sizeof(struct eeprom_table_record), GFP_KERNEL); -	if (!err_data.err_addr) { -		dev_warn(adev->dev, -			"Failed to alloc memory for umc error record in mca notifier!\n"); +	if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) +		return NOTIFY_OK; +	else  		return NOTIFY_DONE; -	} - -	/* -	 * Translate UMC channel address to Physical address -	 */ -	if (adev->umc.ras && -	    adev->umc.ras->convert_ras_error_address) -		adev->umc.ras->convert_ras_error_address(adev, -			&err_data, m->addr, ch_inst, umc_inst); - -	if (amdgpu_bad_page_threshold != 0) { -		amdgpu_ras_add_bad_pages(adev, err_data.err_addr, -						err_data.err_addr_cnt); -		amdgpu_ras_save_bad_pages(adev); -	} - -	kfree(err_data.err_addr); -	return NOTIFY_OK;  }  static struct notifier_block amdgpu_bad_page_nb = { |