| field | value | date |
|---|---|---|
| author | Dmitry Torokhov <[email protected]> | 2024-07-15 14:03:44 -0700 |
| committer | Dmitry Torokhov <[email protected]> | 2024-07-15 14:03:44 -0700 |
| commit | a23e1966932464e1c5226cb9ac4ce1d5fc10ba22 (patch) | |
| tree | bf5f1b57faa01ca31656bfc48c7d6b6f0bc39189 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
| parent | 7c7b1be19b228b450c2945ec379d7fc6bfef9852 (diff) | |
| parent | f3efefb6fdcce604413135bd8d4c5568e53a1f13 (diff) | |
Merge branch 'next' into for-linus
Prepare input updates for 6.11 merge window.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1324 |
1 file changed, 1104 insertions, 220 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8aaa427f8c0f..8ebab6f22e5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -28,6 +28,7 @@  #include <linux/reboot.h>  #include <linux/syscalls.h>  #include <linux/pm_runtime.h> +#include <linux/list_sort.h>  #include "amdgpu.h"  #include "amdgpu_ras.h" @@ -35,8 +36,10 @@  #include "amdgpu_xgmi.h"  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"  #include "nbio_v4_3.h" +#include "nbio_v7_9.h"  #include "atom.h"  #include "amdgpu_reset.h" +#include "amdgpu_psp.h"  #ifdef CONFIG_X86_MCE_AMD  #include <asm/mce.h> @@ -71,6 +74,8 @@ const char *ras_block_string[] = {  	"mca",  	"vcn",  	"jpeg", +	"ih", +	"mpio",  };  const char *ras_mca_block_string[] = { @@ -92,7 +97,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)  	if (!ras_block)  		return "NULL"; -	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT) +	if (ras_block->block >= AMDGPU_RAS_BLOCK_COUNT || +	    ras_block->block >= ARRAY_SIZE(ras_block_string))  		return "OUT OF RANGE";  	if (ras_block->block == AMDGPU_RAS_BLOCK__MCA) @@ -114,6 +120,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)  /* typical ECC bad page rate is 1 bad page per 100MB VRAM */  #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL) +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms +  enum amdgpu_ras_retire_page_reservation {  	AMDGPU_RAS_RETIRE_PAGE_RESERVED,  	AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -151,8 +159,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)  static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)  { -	struct ras_err_data err_data = {0, 0, 0, NULL}; +	struct ras_err_data err_data;  	struct eeprom_table_record err_rec; +	int ret;  	if ((address >= adev->gmc.mc_vram_size) ||  	    (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { @@ -169,6 +178,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre  		return 0;  	} +	ret = amdgpu_ras_error_data_init(&err_data); +	if (ret) +		return ret; +  	memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));  	err_data.err_addr = &err_rec;  	amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0); @@ -179,6 +192,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre  		amdgpu_ras_save_bad_pages(adev, NULL);  	} +	amdgpu_ras_error_data_fini(&err_data); +  	dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");  	dev_warn(adev->dev, "Clear EEPROM:\n");  	dev_warn(adev->dev, "    echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); @@ -200,8 +215,8 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,  		return -EINVAL;  	/* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */ -	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && -	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { +	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && +	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {  		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))  			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");  	} @@ -296,11 +311,13 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,  			return -EINVAL;  		data->head.block = block_id; -		/* only ue and ce errors are supported */ +		/* only ue, ce and 
poison errors are supported */  		if (!memcmp("ue", err, 2))  			data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;  		else if (!memcmp("ce", err, 2))  			data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE; +		else if (!memcmp("poison", err, 6)) +			data->head.type = AMDGPU_RAS_ERROR__POISON;  		else  			return -EINVAL; @@ -422,9 +439,10 @@ static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev,   * The block is one of: umc, sdma, gfx, etc.   *	see ras_block_string[] for details   * - * The error type is one of: ue, ce, where, + * The error type is one of: ue, ce and poison where,   *	ue is multi-uncorrectable   *	ce is single-correctable + *	poison is poison   *   * The sub-block is a the sub-block index, pass 0 if there is no sub-block.   * The address and value are hexadecimal numbers, leading 0x is optional. @@ -610,14 +628,18 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,  	if (amdgpu_ras_query_error_status(obj->adev, &info))  		return -EINVAL; -	if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && -	    obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { +	if (amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && +	    amdgpu_ip_version(obj->adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {  		if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))  			dev_warn(obj->adev->dev, "Failed to reset error counter and error status");  	} -	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, -			  "ce", info.ce_count); +	if (info.head.block == AMDGPU_RAS_BLOCK__UMC) +		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, +				"ce", info.ce_count, "de", info.de_count); +	else +		return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, +				"ce", info.ce_count);  }  /* obj begin */ @@ -627,8 +649,11 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,  static inline void put_obj(struct ras_manager *obj)  { -	if (obj && (--obj->use == 0)) +	if (obj && (--obj->use == 0)) {  		list_del(&obj->node); +		amdgpu_ras_error_data_fini(&obj->err_data); +	} +  	if (obj && (obj->use < 0))  		DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", get_ras_block_str(&obj->head));  } @@ -658,6 +683,9 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,  	if (alive_obj(obj))  		return NULL; +	if (amdgpu_ras_error_data_init(&obj->err_data)) +		return NULL; +  	obj->head = *head;  	obj->adev = adev;  	list_add(&obj->node, &con->head); @@ -757,28 +785,28 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,  	return 0;  } -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev, -		struct ras_common_if *head) -{ -	if (amdgpu_ras_is_feature_allowed(adev, head) || -		amdgpu_ras_is_poison_mode_supported(adev)) -		return 1; -	else -		return 0; -} -  /* wrapper of psp_ras_enable_features */  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,  		struct ras_common_if *head, bool enable)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	union ta_ras_cmd_input *info; -	int ret = 0; +	int ret;  	if (!con)  		return -EINVAL; -	if (head->block == AMDGPU_RAS_BLOCK__GFX) { +	/* For non-gfx ip, do not enable ras feature if it is not allowed */ +	/* For gfx ip, regardless of feature support status, */ +	/* Force issue enable or disable ras feature commands */ +	if (head->block != AMDGPU_RAS_BLOCK__GFX && +	    !amdgpu_ras_is_feature_allowed(adev, head)) +		return 0; + +	/* Only enable gfx ras feature from host side */ +	if 
(head->block == AMDGPU_RAS_BLOCK__GFX && +	    !amdgpu_sriov_vf(adev) && +	    !amdgpu_ras_intr_triggered()) {  		info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);  		if (!info)  			return -ENOMEM; @@ -794,32 +822,24 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,  				.error_type = amdgpu_ras_error_to_ta(head->type),  			};  		} -	} -	/* Do not enable if it is not allowed. */ -	if (enable && !amdgpu_ras_check_feature_allowed(adev, head)) -		goto out; - -	/* Only enable ras feature operation handle on host side */ -	if (head->block == AMDGPU_RAS_BLOCK__GFX && -		!amdgpu_sriov_vf(adev) && -		!amdgpu_ras_intr_triggered()) {  		ret = psp_ras_enable_features(&adev->psp, info, enable);  		if (ret) {  			dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",  				enable ? "enable":"disable",  				get_ras_block_str(head),  				amdgpu_ras_is_poison_mode_supported(adev), ret); -			goto out; +			kfree(info); +			return ret;  		} + +		kfree(info);  	}  	/* setup the obj */  	__amdgpu_ras_feature_enable(adev, head, enable); -out: -	if (head->block == AMDGPU_RAS_BLOCK__GFX) -		kfree(info); -	return ret; + +	return 0;  }  /* Only used in device probe stage and called only once. */ @@ -1022,103 +1042,395 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d  	}  } -/* query/inject/cure begin */ -int amdgpu_ras_query_error_status(struct amdgpu_device *adev, -				  struct ras_query_if *info) +static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, +					      struct ras_manager *ras_mgr, +					      struct ras_err_data *err_data, +					      const char *blk_name, +					      bool is_ue, +					      bool is_de)  { -	struct amdgpu_ras_block_object *block_obj = NULL; -	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); -	struct ras_err_data err_data = {0, 0, 0, NULL}; +	struct amdgpu_smuio_mcm_config_info *mcm_info; +	struct ras_err_node *err_node; +	struct ras_err_info *err_info; + +	if (is_ue) { +		for_each_ras_error(err_node, err_data) { +			err_info = &err_node->err_info; +			mcm_info = &err_info->mcm_info; +			if (err_info->ue_count) { +				dev_info(adev->dev, "socket: %d, die: %d, " +					 "%lld new uncorrectable hardware errors detected in %s block\n", +					 mcm_info->socket_id, +					 mcm_info->die_id, +					 err_info->ue_count, +					 blk_name); +			} +		} -	if (!obj) -		return -EINVAL; +		for_each_ras_error(err_node, &ras_mgr->err_data) { +			err_info = &err_node->err_info; +			mcm_info = &err_info->mcm_info; +			dev_info(adev->dev, "socket: %d, die: %d, " +				 "%lld uncorrectable hardware errors detected in total in %s block\n", +				 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); +		} -	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { -		amdgpu_ras_get_ecc_info(adev, &err_data);  	} else { -		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); -		if (!block_obj || !block_obj->hw_ops)   { -			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", -				     get_ras_block_str(&info->head)); -			return -EINVAL; -		} +		if (is_de) { +			for_each_ras_error(err_node, err_data) { +				err_info = &err_node->err_info; +				mcm_info = &err_info->mcm_info; +				if (err_info->de_count) { +					dev_info(adev->dev, "socket: %d, die: %d, " +						"%lld new deferred hardware errors detected in %s block\n", +						mcm_info->socket_id, +						mcm_info->die_id, +						err_info->de_count, +						blk_name); +				} +			} -		if (block_obj->hw_ops->query_ras_error_count) -			
block_obj->hw_ops->query_ras_error_count(adev, &err_data); +			for_each_ras_error(err_node, &ras_mgr->err_data) { +				err_info = &err_node->err_info; +				mcm_info = &err_info->mcm_info; +				dev_info(adev->dev, "socket: %d, die: %d, " +					"%lld deferred hardware errors detected in total in %s block\n", +					mcm_info->socket_id, mcm_info->die_id, +					err_info->de_count, blk_name); +			} +		} else { +			for_each_ras_error(err_node, err_data) { +				err_info = &err_node->err_info; +				mcm_info = &err_info->mcm_info; +				if (err_info->ce_count) { +					dev_info(adev->dev, "socket: %d, die: %d, " +						"%lld new correctable hardware errors detected in %s block\n", +						mcm_info->socket_id, +						mcm_info->die_id, +						err_info->ce_count, +						blk_name); +				} +			} -		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || -		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) || -		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { -				if (block_obj->hw_ops->query_ras_error_status) -					block_obj->hw_ops->query_ras_error_status(adev); +			for_each_ras_error(err_node, &ras_mgr->err_data) { +				err_info = &err_node->err_info; +				mcm_info = &err_info->mcm_info; +				dev_info(adev->dev, "socket: %d, die: %d, " +					"%lld correctable hardware errors detected in total in %s block\n", +					mcm_info->socket_id, mcm_info->die_id, +					err_info->ce_count, blk_name);  			} +		}  	} +} -	obj->err_data.ue_count += err_data.ue_count; -	obj->err_data.ce_count += err_data.ce_count; - -	info->ue_count = obj->err_data.ue_count; -	info->ce_count = obj->err_data.ce_count; +static inline bool err_data_has_source_info(struct ras_err_data *data) +{ +	return !list_empty(&data->err_node_list); +} -	if (err_data.ce_count) { -		if (adev->smuio.funcs && -		    adev->smuio.funcs->get_socket_id && -		    adev->smuio.funcs->get_die_id) { +static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev, +					     struct ras_query_if *query_if, +					     struct ras_err_data *err_data) +{ +	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head); +	const char *blk_name = get_ras_block_str(&query_if->head); + +	if (err_data->ce_count) { +		if (err_data_has_source_info(err_data)) { +			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, +							  blk_name, false, false); +		} else if (!adev->aid_mask && +			   adev->smuio.funcs && +			   adev->smuio.funcs->get_socket_id && +			   adev->smuio.funcs->get_die_id) {  			dev_info(adev->dev, "socket: %d, die: %d " -					"%ld correctable hardware errors " -					"detected in %s block, no user " -					"action is needed.\n", -					adev->smuio.funcs->get_socket_id(adev), -					adev->smuio.funcs->get_die_id(adev), -					obj->err_data.ce_count, -					get_ras_block_str(&info->head)); +				 "%ld correctable hardware errors " +				 "detected in %s block\n", +				 adev->smuio.funcs->get_socket_id(adev), +				 adev->smuio.funcs->get_die_id(adev), +				 ras_mgr->err_data.ce_count, +				 blk_name);  		} else {  			dev_info(adev->dev, "%ld correctable hardware errors " -					"detected in %s block, no user " -					"action is needed.\n", -					obj->err_data.ce_count, -					get_ras_block_str(&info->head)); +				 "detected in %s block\n", +				 ras_mgr->err_data.ce_count, +				 blk_name);  		}  	} -	if (err_data.ue_count) { -		if (adev->smuio.funcs && -		    adev->smuio.funcs->get_socket_id && -		    adev->smuio.funcs->get_die_id) { + +	if (err_data->ue_count) { +		if (err_data_has_source_info(err_data)) { +			amdgpu_ras_error_print_error_data(adev, 
ras_mgr, err_data, +							  blk_name, true, false); +		} else if (!adev->aid_mask && +			   adev->smuio.funcs && +			   adev->smuio.funcs->get_socket_id && +			   adev->smuio.funcs->get_die_id) {  			dev_info(adev->dev, "socket: %d, die: %d " -					"%ld uncorrectable hardware errors " -					"detected in %s block\n", -					adev->smuio.funcs->get_socket_id(adev), -					adev->smuio.funcs->get_die_id(adev), -					obj->err_data.ue_count, -					get_ras_block_str(&info->head)); +				 "%ld uncorrectable hardware errors " +				 "detected in %s block\n", +				 adev->smuio.funcs->get_socket_id(adev), +				 adev->smuio.funcs->get_die_id(adev), +				 ras_mgr->err_data.ue_count, +				 blk_name);  		} else {  			dev_info(adev->dev, "%ld uncorrectable hardware errors " -					"detected in %s block\n", -					obj->err_data.ue_count, -					get_ras_block_str(&info->head)); +				 "detected in %s block\n", +				 ras_mgr->err_data.ue_count, +				 blk_name);  		}  	} +	if (err_data->de_count) { +		if (err_data_has_source_info(err_data)) { +			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, +							  blk_name, false, true); +		} else if (!adev->aid_mask && +			   adev->smuio.funcs && +			   adev->smuio.funcs->get_socket_id && +			   adev->smuio.funcs->get_die_id) { +			dev_info(adev->dev, "socket: %d, die: %d " +				 "%ld deferred hardware errors " +				 "detected in %s block\n", +				 adev->smuio.funcs->get_socket_id(adev), +				 adev->smuio.funcs->get_die_id(adev), +				 ras_mgr->err_data.de_count, +				 blk_name); +		} else { +			dev_info(adev->dev, "%ld deferred hardware errors " +				 "detected in %s block\n", +				 ras_mgr->err_data.de_count, +				 blk_name); +		} +	} +} + +static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data) +{ +	struct ras_err_node *err_node; +	struct ras_err_info *err_info; + +	if (err_data_has_source_info(err_data)) { +		for_each_ras_error(err_node, err_data) { +			err_info = &err_node->err_info; +			amdgpu_ras_error_statistic_de_count(&obj->err_data, +					&err_info->mcm_info, NULL, err_info->de_count); +			amdgpu_ras_error_statistic_ce_count(&obj->err_data, +					&err_info->mcm_info, NULL, err_info->ce_count); +			amdgpu_ras_error_statistic_ue_count(&obj->err_data, +					&err_info->mcm_info, NULL, err_info->ue_count); +		} +	} else { +		/* for legacy asic path which doesn't has error source info */ +		obj->err_data.ue_count += err_data->ue_count; +		obj->err_data.ce_count += err_data->ce_count; +		obj->err_data.de_count += err_data->de_count; +	} +} + +static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk) +{ +	struct ras_common_if head; + +	memset(&head, 0, sizeof(head)); +	head.block = blk; + +	return amdgpu_ras_find_obj(adev, &head); +} + +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, +			const struct aca_info *aca_info, void *data) +{ +	struct ras_manager *obj; + +	obj = get_ras_manager(adev, blk); +	if (!obj) +		return -EINVAL; + +	return amdgpu_aca_add_handle(adev, &obj->aca_handle, ras_block_str(blk), aca_info, data); +} + +int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk) +{ +	struct ras_manager *obj; + +	obj = get_ras_manager(adev, blk); +	if (!obj) +		return -EINVAL; + +	amdgpu_aca_remove_handle(&obj->aca_handle); +  	return 0;  } -int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, -		enum amdgpu_ras_block block) +static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum 
amdgpu_ras_block blk, +					 enum aca_error_type type, struct ras_err_data *err_data)  { -	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); +	struct ras_manager *obj; -	if (!amdgpu_ras_is_supported(adev, block)) +	obj = get_ras_manager(adev, blk); +	if (!obj)  		return -EINVAL; -	if (!block_obj || !block_obj->hw_ops)   { -		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", -			     ras_block_str(block)); +	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data); +} + +ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, +				  struct aca_handle *handle, char *buf, void *data) +{ +	struct ras_manager *obj = container_of(handle, struct ras_manager, aca_handle); +	struct ras_query_if info = { +		.head = obj->head, +	}; + +	if (amdgpu_ras_query_error_status(obj->adev, &info))  		return -EINVAL; + +	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, +			  "ce", info.ce_count); +} + +static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, +						struct ras_query_if *info, +						struct ras_err_data *err_data, +						unsigned int error_query_mode) +{ +	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; +	struct amdgpu_ras_block_object *block_obj = NULL; +	int ret; + +	if (blk == AMDGPU_RAS_BLOCK_COUNT) +		return -EINVAL; + +	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY) +		return -EINVAL; + +	if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) { +		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) { +			amdgpu_ras_get_ecc_info(adev, err_data); +		} else { +			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); +			if (!block_obj || !block_obj->hw_ops) { +				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", +					     get_ras_block_str(&info->head)); +				return -EINVAL; +			} + +			if (block_obj->hw_ops->query_ras_error_count) +				block_obj->hw_ops->query_ras_error_count(adev, err_data); + +			if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) || +			    (info->head.block == AMDGPU_RAS_BLOCK__GFX) || +			    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) { +				if (block_obj->hw_ops->query_ras_error_status) +					block_obj->hw_ops->query_ras_error_status(adev); +			} +		} +	} else { +		if (amdgpu_aca_is_enabled(adev)) { +			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data); +			if (ret) +				return ret; + +			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data); +			if (ret) +				return ret; +		} else { +			/* FIXME: add code to check return value later */ +			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data); +			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data); +		} +	} + +	return 0; +} + +/* query/inject/cure begin */ +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +{ +	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); +	struct ras_err_data err_data; +	unsigned int error_query_mode; +	int ret; + +	if (!obj) +		return -EINVAL; + +	ret = amdgpu_ras_error_data_init(&err_data); +	if (ret) +		return ret; + +	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) +		return -EINVAL; + +	ret = amdgpu_ras_query_error_status_helper(adev, info, +						   &err_data, +						   error_query_mode); +	if (ret) +		goto out_fini_err_data; + +	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data); + +	info->ue_count = obj->err_data.ue_count; +	
info->ce_count = obj->err_data.ce_count; +	info->de_count = obj->err_data.de_count; + +	amdgpu_ras_error_generate_report(adev, info, &err_data); + +out_fini_err_data: +	amdgpu_ras_error_data_fini(&err_data); + +	return ret; +} + +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, +		enum amdgpu_ras_block block) +{ +	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; +	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; +	struct amdgpu_hive_info *hive; +	int hive_ras_recovery = 0; + +	if (!block_obj || !block_obj->hw_ops) { +		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", +				ras_block_str(block)); +		return -EOPNOTSUPP; +	} + +	if (!amdgpu_ras_is_supported(adev, block) || +	    !amdgpu_ras_get_aca_debug_mode(adev)) +		return -EOPNOTSUPP; + +	hive = amdgpu_get_xgmi_hive(adev); +	if (hive) { +		hive_ras_recovery = atomic_read(&hive->ras_recovery); +		amdgpu_put_xgmi_hive(hive);  	} +	/* skip ras error reset in gpu reset */ +	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || +	    hive_ras_recovery) && +	    ((smu_funcs && smu_funcs->set_debug_mode) || +	     (mca_funcs && mca_funcs->mca_set_debug_mode))) +		return -EOPNOTSUPP; +  	if (block_obj->hw_ops->reset_ras_error_count)  		block_obj->hw_ops->reset_ras_error_count(adev); +	return 0; +} + +int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, +		enum amdgpu_ras_block block) +{ +	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); + +	if (amdgpu_ras_reset_error_count(adev, block) == -EOPNOTSUPP) +		return 0; +  	if ((block == AMDGPU_RAS_BLOCK__GFX) ||  	    (block == AMDGPU_RAS_BLOCK__MMHUB)) {  		if (block_obj->hw_ops->reset_ras_error_status) @@ -1159,7 +1471,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,  	}  	/* Calculate XGMI relative offset */ -	if (adev->gmc.xgmi.num_physical_nodes > 1) { +	if (adev->gmc.xgmi.num_physical_nodes > 1 && +	    info->head.block != AMDGPU_RAS_BLOCK__GFX) {  		block_info.address =  			amdgpu_xgmi_get_relative_phy_addr(adev,  							  block_info.address); @@ -1213,8 +1526,8 @@ static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev,  	/* some hardware/IP supports read to clear  	 * no need to explictly reset the err status after the query call */ -	if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && -	    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { +	if (amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 2) && +	    amdgpu_ip_version(adev, MP0_HWIP, 0) != IP_VERSION(11, 0, 4)) {  		if (amdgpu_ras_reset_error_status(adev, query_info->head.block))  			dev_warn(adev->dev,  				 "Failed to reset error counter and error status\n"); @@ -1374,20 +1687,39 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,  	return sysfs_emit(buf, "feature mask: 0x%x\n", con->features);  } +static ssize_t amdgpu_ras_sysfs_version_show(struct device *dev, +		struct device_attribute *attr, char *buf) +{ +	struct amdgpu_ras *con = +		container_of(attr, struct amdgpu_ras, version_attr); +	return sysfs_emit(buf, "table version: 0x%x\n", con->eeprom_control.tbl_hdr.version); +} + +static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev, +		struct device_attribute *attr, char *buf) +{ +	struct amdgpu_ras *con = +		container_of(attr, struct amdgpu_ras, schema_attr); +	return sysfs_emit(buf, "schema: 
0x%x\n", con->schema); +} +  static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	sysfs_remove_file_from_group(&adev->dev->kobj, +	if (adev->dev->kobj.sd) +		sysfs_remove_file_from_group(&adev->dev->kobj,  				&con->badpages_attr.attr,  				RAS_FS_NAME);  } -static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) +static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct attribute *attrs[] = {  		&con->features_attr.attr, +		&con->version_attr.attr, +		&con->schema_attr.attr,  		NULL  	};  	struct attribute_group group = { @@ -1395,7 +1727,8 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)  		.attrs = attrs,  	}; -	sysfs_remove_group(&adev->dev->kobj, &group); +	if (adev->dev->kobj.sd) +		sysfs_remove_group(&adev->dev->kobj, &group);  	return 0;  } @@ -1442,7 +1775,8 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,  	if (!obj || !obj->attr_inuse)  		return -EINVAL; -	sysfs_remove_file_from_group(&adev->dev->kobj, +	if (adev->dev->kobj.sd) +		sysfs_remove_file_from_group(&adev->dev->kobj,  				&obj->sysfs_attr.attr,  				RAS_FS_NAME);  	obj->attr_inuse = 0; @@ -1463,7 +1797,7 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)  	if (amdgpu_bad_page_threshold != 0)  		amdgpu_ras_sysfs_remove_bad_page_node(adev); -	amdgpu_ras_sysfs_remove_feature_node(adev); +	amdgpu_ras_sysfs_remove_dev_attr_node(adev);  	return 0;  } @@ -1575,6 +1909,11 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)  			amdgpu_ras_debugfs_create(adev, &fs_info, dir);  		}  	} + +	if (amdgpu_aca_is_enabled(adev)) +		amdgpu_aca_smu_debugfs_init(adev, dir); +	else +		amdgpu_mca_smu_debugfs_init(adev, dir);  }  /* debugfs end */ @@ -1584,6 +1923,10 @@ static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,  		amdgpu_ras_sysfs_badpages_read, NULL, 0);  static DEVICE_ATTR(features, S_IRUGO,  		amdgpu_ras_sysfs_features_read, NULL); +static DEVICE_ATTR(version, 0444, +		amdgpu_ras_sysfs_version_show, NULL); +static DEVICE_ATTR(schema, 0444, +		amdgpu_ras_sysfs_schema_show, NULL);  static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1592,6 +1935,8 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  	};  	struct attribute *attrs[] = {  		&con->features_attr.attr, +		&con->version_attr.attr, +		&con->schema_attr.attr,  		NULL  	};  	struct bin_attribute *bin_attrs[] = { @@ -1600,11 +1945,20 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  	};  	int r; +	group.attrs = attrs; +  	/* add features entry */  	con->features_attr = dev_attr_features; -	group.attrs = attrs;  	sysfs_attr_init(attrs[0]); +	/* add version entry */ +	con->version_attr = dev_attr_version; +	sysfs_attr_init(attrs[1]); + +	/* add schema entry */ +	con->schema_attr = dev_attr_schema; +	sysfs_attr_init(attrs[2]); +  	if (amdgpu_bad_page_threshold != 0) {  		/* add bad_page_features entry */  		bin_attr_gpu_vram_bad_pages.private = NULL; @@ -1687,7 +2041,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  		}  	} -	amdgpu_umc_poison_handler(adev, false); +	amdgpu_umc_poison_handler(adev, obj->head.block, false);  	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)  		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -1706,19 +2060,23 @@ static void 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  				struct amdgpu_iv_entry *entry)  {  	dev_info(obj->adev->dev, -		"Poison is created, no user action is needed.\n"); +		"Poison is created\n");  }  static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,  				struct amdgpu_iv_entry *entry)  {  	struct ras_ih_data *data = &obj->ih_data; -	struct ras_err_data err_data = {0, 0, 0, NULL}; +	struct ras_err_data err_data;  	int ret;  	if (!data->cb)  		return; +	ret = amdgpu_ras_error_data_init(&err_data); +	if (ret) +		return; +  	/* Let IP handle its data, maybe we need get the output  	 * from the callback to update the error type/count, etc  	 */ @@ -1734,7 +2092,10 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,  		 */  		obj->err_data.ue_count += err_data.ue_count;  		obj->err_data.ce_count += err_data.ce_count; +		obj->err_data.de_count += err_data.de_count;  	} + +	amdgpu_ras_error_data_fini(&err_data);  }  static void amdgpu_ras_interrupt_handler(struct ras_manager *obj) @@ -1910,14 +2271,18 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)  		 * should be removed until smu fix handle ecc_info table.  		 */  		if ((info.head.block == AMDGPU_RAS_BLOCK__UMC) && -			(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))) +		    (amdgpu_ip_version(adev, MP1_HWIP, 0) == +		     IP_VERSION(13, 0, 2)))  			continue;  		amdgpu_ras_query_error_status(adev, &info); -		if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && -		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && -		    adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { +		if (amdgpu_ip_version(adev, MP0_HWIP, 0) != +			    IP_VERSION(11, 0, 2) && +		    amdgpu_ip_version(adev, MP0_HWIP, 0) != +			    IP_VERSION(11, 0, 4) && +		    amdgpu_ip_version(adev, MP0_HWIP, 0) != +			    IP_VERSION(13, 0, 0)) {  			if (amdgpu_ras_reset_error_status(adev, info.head.block))  				dev_warn(adev->dev, "Failed to reset error counter and error status");  		} @@ -2026,9 +2391,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  	struct amdgpu_device *remote_adev = NULL;  	struct amdgpu_device *adev = ras->adev;  	struct list_head device_list, *device_list_handle =  NULL; +	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); +	if (hive) +		atomic_set(&hive->ras_recovery, 1);  	if (!ras->disable_ras_err_cnt_harvest) { -		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);  		/* Build list of devices to query RAS related errors */  		if  (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -2045,7 +2412,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  			amdgpu_ras_log_on_err_counter(remote_adev);  		} -		amdgpu_put_xgmi_hive(hive);  	}  	if (amdgpu_device_should_recover_gpu(ras->adev)) { @@ -2072,12 +2438,30 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  			if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {  				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;  				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + +				/* For any RAS error that needs a full reset to +				 * recover, set the fatal error status +				 */ +				if (hive) { +					list_for_each_entry(remote_adev, +							    &hive->device_list, +							    gmc.xgmi.head) +						amdgpu_ras_set_fed(remote_adev, +								   true); +				} else { +					amdgpu_ras_set_fed(adev, true); +				} +				psp_fatal_error_recovery_quirk(&adev->psp);  			}  		}  		amdgpu_device_gpu_recover(ras->adev, NULL, 
&reset_context);  	}  	atomic_set(&ras->in_recovery, 0); +	if (hive) { +		atomic_set(&hive->ras_recovery, 0); +		amdgpu_put_xgmi_hive(hive); +	}  }  /* alloc/realloc bps array */ @@ -2290,6 +2674,32 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,  	}  } +static int amdgpu_ras_page_retirement_thread(void *param) +{ +	struct amdgpu_device *adev = (struct amdgpu_device *)param; +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + +	while (!kthread_should_stop()) { + +		wait_event_interruptible(con->page_retirement_wq, +				kthread_should_stop() || +				atomic_read(&con->page_retirement_req_cnt)); + +		if (kthread_should_stop()) +			break; + +		dev_info(adev->dev, "Start processing page retirement. request:%d\n", +			atomic_read(&con->page_retirement_req_cnt)); + +		atomic_dec(&con->page_retirement_req_cnt); + +		amdgpu_umc_bad_page_polling_timeout(adev, +				false, MAX_UMC_POISON_POLLING_TIME_ASYNC); +	} + +	return 0; +} +  int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -2312,7 +2722,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  		return 0;  	data = &con->eh_data; -	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); +	*data = kzalloc(sizeof(**data), GFP_KERNEL);  	if (!*data) {  		ret = -ENOMEM;  		goto out; @@ -2353,6 +2763,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  		}  	} +	mutex_init(&con->page_retirement_lock); +	init_waitqueue_head(&con->page_retirement_wq); +	atomic_set(&con->page_retirement_req_cnt, 0); +	con->page_retirement_thread = +		kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); +	if (IS_ERR(con->page_retirement_thread)) { +		con->page_retirement_thread = NULL; +		dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n"); +	} +  #ifdef CONFIG_X86_MCE_AMD  	if ((adev->asic_type == CHIP_ALDEBARAN) &&  	    (adev->gmc.xgmi.connected_to_cpu)) @@ -2388,6 +2808,11 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)  	if (!data)  		return 0; +	if (con->page_retirement_thread) +		kthread_stop(con->page_retirement_thread); + +	atomic_set(&con->page_retirement_req_cnt, 0); +  	cancel_work_sync(&con->recovery_work);  	mutex_lock(&con->recovery_lock); @@ -2403,8 +2828,9 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)  static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  {  	if (amdgpu_sriov_vf(adev)) { -		switch (adev->ip_versions[MP0_HWIP][0]) { +		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {  		case IP_VERSION(13, 0, 2): +		case IP_VERSION(13, 0, 6):  			return true;  		default:  			return false; @@ -2412,8 +2838,9 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  	}  	if (adev->asic_type == CHIP_IP_DISCOVERY) { -		switch (adev->ip_versions[MP0_HWIP][0]) { +		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {  		case IP_VERSION(13, 0, 0): +		case IP_VERSION(13, 0, 6):  		case IP_VERSION(13, 0, 10):  			return true;  		default: @@ -2440,13 +2867,94 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev)  	if (!ctx)  		return; -	if (strnstr(ctx->vbios_version, "D16406", -		    sizeof(ctx->vbios_version)) || -		strnstr(ctx->vbios_version, "D36002", -			sizeof(ctx->vbios_version))) +	if (strnstr(ctx->vbios_pn, "D16406", +		    sizeof(ctx->vbios_pn)) || +		strnstr(ctx->vbios_pn, "D36002", +			sizeof(ctx->vbios_pn)))  		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX);  } +/* Query ras capablity via atomfirmware interface */ 
+static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) +{ +	/* mem_ecc cap */ +	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { +		dev_info(adev->dev, "MEM ECC is active.\n"); +		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | +					 1 << AMDGPU_RAS_BLOCK__DF); +	} else { +		dev_info(adev->dev, "MEM ECC is not presented.\n"); +	} + +	/* sram_ecc cap */ +	if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { +		dev_info(adev->dev, "SRAM ECC is active.\n"); +		if (!amdgpu_sriov_vf(adev)) +			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | +						  1 << AMDGPU_RAS_BLOCK__DF); +		else +			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | +						 1 << AMDGPU_RAS_BLOCK__SDMA | +						 1 << AMDGPU_RAS_BLOCK__GFX); + +		/* +		 * VCN/JPEG RAS can be supported on both bare metal and +		 * SRIOV environment +		 */ +		if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || +		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || +		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) +			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | +						 1 << AMDGPU_RAS_BLOCK__JPEG); +		else +			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | +						  1 << AMDGPU_RAS_BLOCK__JPEG); + +		/* +		 * XGMI RAS is not supported if xgmi num physical nodes +		 * is zero +		 */ +		if (!adev->gmc.xgmi.num_physical_nodes) +			adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); +	} else { +		dev_info(adev->dev, "SRAM ECC is not presented.\n"); +	} +} + +/* Query poison mode from umc/df IP callbacks */ +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	bool df_poison, umc_poison; + +	/* poison setting is useless on SRIOV guest */ +	if (amdgpu_sriov_vf(adev) || !con) +		return; + +	/* Init poison supported flag, the default value is false */ +	if (adev->gmc.xgmi.connected_to_cpu || +	    adev->gmc.is_app_apu) { +		/* enabled by default when GPU is connected to CPU */ +		con->poison_supported = true; +	} else if (adev->df.funcs && +	    adev->df.funcs->query_ras_poison_mode && +	    adev->umc.ras && +	    adev->umc.ras->query_ras_poison_mode) { +		df_poison = +			adev->df.funcs->query_ras_poison_mode(adev); +		umc_poison = +			adev->umc.ras->query_ras_poison_mode(adev); + +		/* Only poison is set in both DF and UMC, we can support it */ +		if (df_poison && umc_poison) +			con->poison_supported = true; +		else if (df_poison != umc_poison) +			dev_warn(adev->dev, +				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n", +				df_poison, umc_poison); +	} +} +  /*   * check hardware's ras ability which will be saved in hw_supported.   
* if hardware does not support ras, we can skip some ras initializtion and @@ -2463,45 +2971,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)  	if (!amdgpu_ras_asic_supported(adev))  		return; -	if (!adev->gmc.xgmi.connected_to_cpu &&	!adev->gmc.is_app_apu) { -		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { -			dev_info(adev->dev, "MEM ECC is active.\n"); -			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | -						   1 << AMDGPU_RAS_BLOCK__DF); -		} else { -			dev_info(adev->dev, "MEM ECC is not presented.\n"); -		} - -		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { -			dev_info(adev->dev, "SRAM ECC is active.\n"); -			if (!amdgpu_sriov_vf(adev)) -				adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | -							    1 << AMDGPU_RAS_BLOCK__DF); -			else -				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | -								1 << AMDGPU_RAS_BLOCK__SDMA | -								1 << AMDGPU_RAS_BLOCK__GFX); +	/* query ras capability from psp */ +	if (amdgpu_psp_get_ras_capability(&adev->psp)) +		goto init_ras_enabled_flag; -			/* VCN/JPEG RAS can be supported on both bare metal and -			 * SRIOV environment -			 */ -			if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || -			    adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) -				adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | -							1 << AMDGPU_RAS_BLOCK__JPEG); -			else -				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | -							1 << AMDGPU_RAS_BLOCK__JPEG); - -			/* -			 * XGMI RAS is not supported if xgmi num physical nodes -			 * is zero -			 */ -			if (!adev->gmc.xgmi.num_physical_nodes) -				adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); -		} else { -			dev_info(adev->dev, "SRAM ECC is not presented.\n"); -		} +	/* query ras capablity from bios */ +	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { +		amdgpu_ras_query_ras_capablity_from_vbios(adev);  	} else {  		/* driver only manages a few IP blocks RAS feature  		 * when GPU is connected cpu through XGMI */ @@ -2510,13 +2986,21 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)  					   1 << AMDGPU_RAS_BLOCK__MMHUB);  	} +	/* apply asic specific settings (vega20 only for now) */  	amdgpu_ras_get_quirks(adev); +	/* query poison mode from umc/df ip callback */ +	amdgpu_ras_query_poison_mode(adev); + +init_ras_enabled_flag:  	/* hw_supported needs to be aligned with RAS block mask. */  	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK;  	adev->ras_enabled = amdgpu_ras_enable == 0 ? 
0 :  		adev->ras_hw_enabled & amdgpu_ras_mask; + +	/* aca is disabled by default */ +	adev->aca.is_enabled = false;  }  static void amdgpu_ras_counte_dw(struct work_struct *work) @@ -2544,36 +3028,12 @@ Out:  	pm_runtime_put_autosuspend(dev->dev);  } -static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) +static int amdgpu_get_ras_schema(struct amdgpu_device *adev)  { -	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	bool df_poison, umc_poison; - -	/* poison setting is useless on SRIOV guest */ -	if (amdgpu_sriov_vf(adev) || !con) -		return; - -	/* Init poison supported flag, the default value is false */ -	if (adev->gmc.xgmi.connected_to_cpu) { -		/* enabled by default when GPU is connected to CPU */ -		con->poison_supported = true; -	} else if (adev->df.funcs && -	    adev->df.funcs->query_ras_poison_mode && -	    adev->umc.ras && -	    adev->umc.ras->query_ras_poison_mode) { -		df_poison = -			adev->df.funcs->query_ras_poison_mode(adev); -		umc_poison = -			adev->umc.ras->query_ras_poison_mode(adev); - -		/* Only poison is set in both DF and UMC, we can support it */ -		if (df_poison && umc_poison) -			con->poison_supported = true; -		else if (df_poison != umc_poison) -			dev_warn(adev->dev, -				"Poison setting is inconsistent in DF/UMC(%d:%d)!\n", -				df_poison, umc_poison); -	} +	return  amdgpu_ras_is_poison_mode_supported(adev) ? AMDGPU_RAS_ERROR__POISON : 0 | +			AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE | +			AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE | +			AMDGPU_RAS_ERROR__PARITY;  }  int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2584,10 +3044,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	if (con)  		return 0; -	con = kmalloc(sizeof(struct amdgpu_ras) + +	con = kzalloc(sizeof(*con) +  			sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +  			sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT, -			GFP_KERNEL|__GFP_ZERO); +			GFP_KERNEL);  	if (!con)  		return -ENOMEM; @@ -2618,6 +3078,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	con->update_channel_flag = false;  	con->features = 0; +	con->schema = 0;  	INIT_LIST_HEAD(&con->head);  	/* Might need get this flag from vbios. 
*/  	con->flags = RAS_DEFAULT_FLAGS; @@ -2625,7 +3086,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	/* initialize nbio ras function ahead of any other  	 * ras functions so hardware fatal error interrupt  	 * can be enabled as early as possible */ -	switch (adev->ip_versions[NBIO_HWIP][0]) { +	switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {  	case IP_VERSION(7, 4, 0):  	case IP_VERSION(7, 4, 1):  	case IP_VERSION(7, 4, 4): @@ -2642,6 +3103,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  			 * check DF RAS */  			adev->nbio.ras = &nbio_v4_3_ras;  		break; +	case IP_VERSION(7, 9, 0): +		if (!adev->gmc.is_app_apu) +			adev->nbio.ras = &nbio_v7_9_ras; +		break;  	default:  		/* nbio ras is not available */  		break; @@ -2667,7 +3132,14 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  			goto release_con;  	} -	amdgpu_ras_query_poison_mode(adev); +	/* Packed socket_id to ras feature mask bits[31:29] */ +	if (adev->smuio.funcs && +	    adev->smuio.funcs->get_socket_id) +		con->features |= ((adev->smuio.funcs->get_socket_id(adev)) << +					AMDGPU_RAS_FEATURES_SOCKETID_SHIFT); + +	/* Get RAS schema for particular SOC */ +	con->schema = amdgpu_get_ras_schema(adev);  	if (amdgpu_ras_fs_init(adev)) {  		r = -EINVAL; @@ -2765,23 +3237,28 @@ int amdgpu_ras_block_late_init(struct amdgpu_device *adev,  			goto cleanup;  	} -	r = amdgpu_ras_sysfs_create(adev, ras_block); -	if (r) -		goto interrupt; +	if (ras_obj->hw_ops && +	    (ras_obj->hw_ops->query_ras_error_count || +	     ras_obj->hw_ops->query_ras_error_status)) { +		r = amdgpu_ras_sysfs_create(adev, ras_block); +		if (r) +			goto interrupt; -	/* Those are the cached values at init. -	 */ -	query_info = kzalloc(sizeof(struct ras_query_if), GFP_KERNEL); -	if (!query_info) -		return -ENOMEM; -	memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); +		/* Those are the cached values at init. +		 */ +		query_info = kzalloc(sizeof(*query_info), GFP_KERNEL); +		if (!query_info) +			return -ENOMEM; +		memcpy(&query_info->head, ras_block, sizeof(struct ras_common_if)); -	if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { -		atomic_set(&con->ras_ce_count, ce_count); -		atomic_set(&con->ras_ue_count, ue_count); +		if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { +			atomic_set(&con->ras_ce_count, ce_count); +			atomic_set(&con->ras_ue_count, ue_count); +		} + +		kfree(query_info);  	} -	kfree(query_info);  	return 0;  interrupt: @@ -2865,7 +3342,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)  	amdgpu_ras_disable_all_features(adev, 0);  	/* Make sure all ras objects are disabled. 
*/ -	if (con->features) +	if (AMDGPU_RAS_GET_FEATURES(con->features))  		amdgpu_ras_disable_all_features(adev, 1);  } @@ -2879,13 +3356,29 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)  	if (amdgpu_sriov_vf(adev))  		return 0; +	if (amdgpu_aca_is_enabled(adev)) { +		if (amdgpu_in_reset(adev)) +			r = amdgpu_aca_reset(adev); +		 else +			r = amdgpu_aca_init(adev); +		if (r) +			return r; + +		amdgpu_ras_set_aca_debug_mode(adev, false); +	} else { +		amdgpu_ras_set_mca_debug_mode(adev, false); +	} +  	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { -		if (!node->ras_obj) { +		obj = node->ras_obj; +		if (!obj) {  			dev_warn(adev->dev, "Warning: abnormal ras list node.\n");  			continue;  		} -		obj = node->ras_obj; +		if (!amdgpu_ras_is_supported(adev, obj->ras_comm.block)) +			continue; +  		if (obj->ras_late_init) {  			r = obj->ras_late_init(adev, &obj->ras_comm);  			if (r) { @@ -2910,7 +3403,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)  	/* Need disable ras on all IPs here before ip [hw/sw]fini */ -	if (con->features) +	if (AMDGPU_RAS_GET_FEATURES(con->features))  		amdgpu_ras_disable_all_features(adev, 0);  	amdgpu_ras_recovery_fini(adev);  	return 0; @@ -2943,10 +3436,13 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)  	amdgpu_ras_fs_fini(adev);  	amdgpu_ras_interrupt_remove_all(adev); -	WARN(con->features, "Feature mask is not cleared"); +	if (amdgpu_aca_is_enabled(adev)) +		amdgpu_aca_fini(adev); -	if (con->features) -		amdgpu_ras_disable_all_features(adev, 1); +	WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); + +	if (AMDGPU_RAS_GET_FEATURES(con->features)) +		amdgpu_ras_disable_all_features(adev, 0);  	cancel_delayed_work_sync(&con->ras_counte_delay_work); @@ -2956,12 +3452,28 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)  	return 0;  } -void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)  { -	amdgpu_ras_check_supported(adev); -	if (!adev->ras_hw_enabled) -		return; +	struct amdgpu_ras *ras; + +	ras = amdgpu_ras_get_context(adev); +	if (!ras) +		return false; + +	return atomic_read(&ras->fed); +} +void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) +{ +	struct amdgpu_ras *ras; + +	ras = amdgpu_ras_get_context(adev); +	if (ras) +		atomic_set(&ras->fed, !!status); +} + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +{  	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {  		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); @@ -3136,6 +3648,11 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,  	 * that the ras block supports ras function.  	 
*/  	if (!ret && +	    (block == AMDGPU_RAS_BLOCK__GFX || +	     block == AMDGPU_RAS_BLOCK__SDMA || +	     block == AMDGPU_RAS_BLOCK__VCN || +	     block == AMDGPU_RAS_BLOCK__JPEG) && +		(amdgpu_ras_mask & (1 << block)) &&  	    amdgpu_ras_is_poison_mode_supported(adev) &&  	    amdgpu_ras_get_ras_block(adev, block, 0))  		ret = 1; @@ -3152,6 +3669,73 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)  	return 0;  } +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	int ret = 0; + +	if (con) { +		ret = amdgpu_mca_smu_set_debug_mode(adev, enable); +		if (!ret) +			con->is_aca_debug_mode = enable; +	} + +	return ret; +} + +int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	int ret = 0; + +	if (con) { +		if (amdgpu_aca_is_enabled(adev)) +			ret = amdgpu_aca_smu_set_debug_mode(adev, enable); +		else +			ret = amdgpu_mca_smu_set_debug_mode(adev, enable); +		if (!ret) +			con->is_aca_debug_mode = enable; +	} + +	return ret; +} + +bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; +	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + +	if (!con) +		return false; + +	if ((amdgpu_aca_is_enabled(adev) && smu_funcs && smu_funcs->set_debug_mode) || +	    (!amdgpu_aca_is_enabled(adev) && mca_funcs && mca_funcs->mca_set_debug_mode)) +		return con->is_aca_debug_mode; +	else +		return true; +} + +bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, +				     unsigned int *error_query_mode) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; +	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; + +	if (!con) { +		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY; +		return false; +	} + +	if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) +		*error_query_mode = +			(con->is_aca_debug_mode) ? 
AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY; +	else +		*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY; + +	return true; +}  /* Register each ip ras block into amdgpu ras */  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, @@ -3311,3 +3895,303 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,  		WREG32(err_status_hi_offset, 0);  	}  } + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data) +{ +	memset(err_data, 0, sizeof(*err_data)); + +	INIT_LIST_HEAD(&err_data->err_node_list); + +	return 0; +} + +static void amdgpu_ras_error_node_release(struct ras_err_node *err_node) +{ +	if (!err_node) +		return; + +	list_del(&err_node->node); +	kvfree(err_node); +} + +void amdgpu_ras_error_data_fini(struct ras_err_data *err_data) +{ +	struct ras_err_node *err_node, *tmp; + +	list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) +		amdgpu_ras_error_node_release(err_node); +} + +static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data, +							     struct amdgpu_smuio_mcm_config_info *mcm_info) +{ +	struct ras_err_node *err_node; +	struct amdgpu_smuio_mcm_config_info *ref_id; + +	if (!err_data || !mcm_info) +		return NULL; + +	for_each_ras_error(err_node, err_data) { +		ref_id = &err_node->err_info.mcm_info; + +		if (mcm_info->socket_id == ref_id->socket_id && +		    mcm_info->die_id == ref_id->die_id) +			return err_node; +	} + +	return NULL; +} + +static struct ras_err_node *amdgpu_ras_error_node_new(void) +{ +	struct ras_err_node *err_node; + +	err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL); +	if (!err_node) +		return NULL; + +	INIT_LIST_HEAD(&err_node->node); + +	return err_node; +} + +static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct list_head *b) +{ +	struct ras_err_node *nodea = container_of(a, struct ras_err_node, node); +	struct ras_err_node *nodeb = container_of(b, struct ras_err_node, node); +	struct amdgpu_smuio_mcm_config_info *infoa = &nodea->err_info.mcm_info; +	struct amdgpu_smuio_mcm_config_info *infob = &nodeb->err_info.mcm_info; + +	if (unlikely(infoa->socket_id != infob->socket_id)) +		return infoa->socket_id - infob->socket_id; +	else +		return infoa->die_id - infob->die_id; + +	return 0; +} + +static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data, +				struct amdgpu_smuio_mcm_config_info *mcm_info) +{ +	struct ras_err_node *err_node; + +	err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info); +	if (err_node) +		return &err_node->err_info; + +	err_node = amdgpu_ras_error_node_new(); +	if (!err_node) +		return NULL; + +	INIT_LIST_HEAD(&err_node->err_info.err_addr_list); + +	memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info)); + +	err_data->err_list_count++; +	list_add_tail(&err_node->node, &err_data->err_node_list); +	list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp); + +	return &err_node->err_info; +} + +void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr) +{ +	struct ras_err_addr *mca_err_addr; + +	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL); +	if (!mca_err_addr) +		return; + +	INIT_LIST_HEAD(&mca_err_addr->node); + +	mca_err_addr->err_status = err_addr->err_status; +	mca_err_addr->err_ipid = err_addr->err_ipid; +	mca_err_addr->err_addr = err_addr->err_addr; + +	list_add_tail(&mca_err_addr->node, &err_info->err_addr_list); +} + +void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr 
*mca_err_addr) +{ +	list_del(&mca_err_addr->node); +	kfree(mca_err_addr); +} + +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count) +{ +	struct ras_err_info *err_info; + +	if (!err_data || !mcm_info) +		return -EINVAL; + +	if (!count) +		return 0; + +	err_info = amdgpu_ras_error_get_info(err_data, mcm_info); +	if (!err_info) +		return -EINVAL; + +	if (err_addr && err_addr->err_status) +		amdgpu_ras_add_mca_err_addr(err_info, err_addr); + +	err_info->ue_count += count; +	err_data->ue_count += count; + +	return 0; +} + +int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count) +{ +	struct ras_err_info *err_info; + +	if (!err_data || !mcm_info) +		return -EINVAL; + +	if (!count) +		return 0; + +	err_info = amdgpu_ras_error_get_info(err_data, mcm_info); +	if (!err_info) +		return -EINVAL; + +	err_info->ce_count += count; +	err_data->ce_count += count; + +	return 0; +} + +int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count) +{ +	struct ras_err_info *err_info; + +	if (!err_data || !mcm_info) +		return -EINVAL; + +	if (!count) +		return 0; + +	err_info = amdgpu_ras_error_get_info(err_data, mcm_info); +	if (!err_info) +		return -EINVAL; + +	if (err_addr && err_addr->err_status) +		amdgpu_ras_add_mca_err_addr(err_info, err_addr); + +	err_info->de_count += count; +	err_data->de_count += count; + +	return 0; +} + +#define mmMP0_SMN_C2PMSG_92	0x1609C +#define mmMP0_SMN_C2PMSG_126	0x160BE +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, +						 u32 instance, u32 boot_error) +{ +	u32 socket_id, aid_id, hbm_id; +	u32 reg_data; +	u64 reg_addr; + +	socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); +	aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); +	hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); + +	/* The pattern for smn addressing in other SOC could be different from +	 * the one for aqua_vanjaram. We should revisit the code if the pattern +	 * is changed. 
In such case, replace the aqua_vanjaram implementation +	 * with more common helper */ +	reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + +		   aqua_vanjaram_encode_ext_smn_addressing(instance); + +	reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); +	dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", +		socket_id, aid_id, reg_data); + +	if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", +			 socket_id, aid_id, hbm_id); + +	if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", +			 socket_id, aid_id); + +	if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", +			 socket_id, aid_id); + +	if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", +			 socket_id, aid_id); + +	if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", +			 socket_id, aid_id); + +	if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", +			 socket_id, aid_id); + +	if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", +			 socket_id, aid_id, hbm_id); + +	if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) +		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", +			 socket_id, aid_id, hbm_id); +} + +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, +					     u32 instance, u32 *boot_error) +{ +	u32 reg_addr; +	u32 reg_data; +	int retry_loop; + +	reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + +		   aqua_vanjaram_encode_ext_smn_addressing(instance); + +	for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { +		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); +		if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) { +			*boot_error = AMDGPU_RAS_BOOT_SUCEESS; +			return 0; +		} +		msleep(1); +	} + +	/* The pattern for smn addressing in other SOC could be different from +	 * the one for aqua_vanjaram. We should revisit the code if the pattern +	 * is changed. In such case, replace the aqua_vanjaram implementation +	 * with more common helper */ +	reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + +		   aqua_vanjaram_encode_ext_smn_addressing(instance); + +	for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { +		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); +		if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { +			*boot_error = reg_data; +			return 0; +		} +		msleep(1); +	} + +	*boot_error = reg_data; +	return -ETIME; +} + +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances) +{ +	u32 boot_error = 0; +	u32 i; + +	for (i = 0; i < num_instances; i++) { +		if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) +			amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); +	} +} |