Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c	728
1 file changed, 532 insertions(+), 196 deletions(-)
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1adc81a55734..d0307c55da50 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -120,10 +120,12 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)  /* typical ECC bad page rate is 1 bad page per 100MB VRAM */  #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms +#define MAX_FLUSH_RETIRE_DWORK_TIMES  100 +  enum amdgpu_ras_retire_page_reservation {  	AMDGPU_RAS_RETIRE_PAGE_RESERVED,  	AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1055,7 +1057,7 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,  	struct amdgpu_smuio_mcm_config_info *mcm_info;  	struct ras_err_node *err_node;  	struct ras_err_info *err_info; -	u64 event_id = qctx->event_id; +	u64 event_id = qctx->evid.event_id;  	if (is_ue) {  		for_each_ras_error(err_node, err_data) { @@ -1140,7 +1142,7 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,  {  	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);  	const char *blk_name = get_ras_block_str(&query_if->head); -	u64 event_id = qctx->event_id; +	u64 event_id = qctx->evid.event_id;  	if (err_data->ce_count) {  		if (err_data_has_source_info(err_data)) { @@ -1295,11 +1297,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a  		.head = obj->head,  	}; +	if (!amdgpu_ras_get_error_query_ready(obj->adev)) +		return sysfs_emit(buf, "Query currently inaccessible\n"); +  	if (amdgpu_ras_query_error_status(obj->adev, &info))  		return -EINVAL;  	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, -			  "ce", info.ce_count, "de", info.ue_count); +			  "ce", info.ce_count, "de", info.de_count);  }  static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev, @@ -1363,7 +1368,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,  }  /* query/inject/cure begin */ -int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev, +						    struct ras_query_if *info, +						    enum ras_event_type type)  {  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);  	struct ras_err_data err_data; @@ -1382,12 +1389,19 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i  		return -EINVAL;  	memset(&qctx, 0, sizeof(qctx)); -	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? 
-						   RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); +	qctx.evid.type = type; +	qctx.evid.event_id = amdgpu_ras_acquire_event_id(adev, type); + +	if (!down_read_trylock(&adev->reset_domain->sem)) { +		ret = -EIO; +		goto out_fini_err_data; +	} +  	ret = amdgpu_ras_query_error_status_helper(adev, info,  						   &err_data,  						   &qctx,  						   error_query_mode); +	up_read(&adev->reset_domain->sem);  	if (ret)  		goto out_fini_err_data; @@ -1405,15 +1419,17 @@ out_fini_err_data:  	return ret;  } +int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info) +{ +	return amdgpu_ras_query_error_status_with_event(adev, info, RAS_EVENT_TYPE_INVALID); +} +  int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,  		enum amdgpu_ras_block block)  {  	struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);  	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;  	const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; -	struct amdgpu_hive_info *hive; -	int hive_ras_recovery = 0;  	if (!block_obj || !block_obj->hw_ops) {  		dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1425,15 +1441,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev,  	    !amdgpu_ras_get_aca_debug_mode(adev))  		return -EOPNOTSUPP; -	hive = amdgpu_get_xgmi_hive(adev); -	if (hive) { -		hive_ras_recovery = atomic_read(&hive->ras_recovery); -		amdgpu_put_xgmi_hive(hive); -	} -  	/* skip ras error reset in gpu reset */ -	if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || -	    hive_ras_recovery) && +	if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) &&  	    ((smu_funcs && smu_funcs->set_debug_mode) ||  	     (mca_funcs && mca_funcs->mca_set_debug_mode)))  		return -EOPNOTSUPP; @@ -1724,6 +1733,39 @@ static ssize_t amdgpu_ras_sysfs_schema_show(struct device *dev,  	return sysfs_emit(buf, "schema: 0x%x\n", con->schema);  } +static struct { +	enum ras_event_type type; +	const char *name; +} dump_event[] = { +	{RAS_EVENT_TYPE_FATAL, "Fatal Error"}, +	{RAS_EVENT_TYPE_POISON_CREATION, "Poison Creation"}, +	{RAS_EVENT_TYPE_POISON_CONSUMPTION, "Poison Consumption"}, +}; + +static ssize_t amdgpu_ras_sysfs_event_state_show(struct device *dev, +						 struct device_attribute *attr, char *buf) +{ +	struct amdgpu_ras *con = +		container_of(attr, struct amdgpu_ras, event_state_attr); +	struct ras_event_manager *event_mgr = con->event_mgr; +	struct ras_event_state *event_state; +	int i, size = 0; + +	if (!event_mgr) +		return -EINVAL; + +	size += sysfs_emit_at(buf, size, "current seqno: %llu\n", atomic64_read(&event_mgr->seqno)); +	for (i = 0; i < ARRAY_SIZE(dump_event); i++) { +		event_state = &event_mgr->event_state[dump_event[i].type]; +		size += sysfs_emit_at(buf, size, "%s: count:%llu, last_seqno:%llu\n", +				      dump_event[i].name, +				      atomic64_read(&event_state->count), +				      event_state->last_seqno); +	} + +	return (ssize_t)size; +} +  static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1741,6 +1783,7 @@ static int amdgpu_ras_sysfs_remove_dev_attr_node(struct amdgpu_device *adev)  		&con->features_attr.attr,  		&con->version_attr.attr,  		&con->schema_attr.attr, +		&con->event_state_attr.attr,  		NULL  	};  	struct attribute_group group = { @@ -1759,6 +1802,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,  {  	struct ras_manager 
*obj = amdgpu_ras_find_obj(adev, head); +	if (amdgpu_aca_is_enabled(adev)) +		return 0; +  	if (!obj || obj->attr_inuse)  		return -EINVAL; @@ -1793,6 +1839,9 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,  {  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); +	if (amdgpu_aca_is_enabled(adev)) +		return 0; +  	if (!obj || !obj->attr_inuse)  		return -EINVAL; @@ -1905,6 +1954,23 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,  			    obj, &amdgpu_ras_debugfs_ops);  } +static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) +{ +	bool ret; + +	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { +	case IP_VERSION(13, 0, 6): +	case IP_VERSION(13, 0, 14): +		ret = true; +		break; +	default: +		ret = false; +		break; +	} + +	return ret; +} +  void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1931,10 +1997,12 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)  		}  	} -	if (amdgpu_aca_is_enabled(adev)) -		amdgpu_aca_smu_debugfs_init(adev, dir); -	else -		amdgpu_mca_smu_debugfs_init(adev, dir); +	if (amdgpu_ras_aca_is_supported(adev)) { +		if (amdgpu_aca_is_enabled(adev)) +			amdgpu_aca_smu_debugfs_init(adev, dir); +		else +			amdgpu_mca_smu_debugfs_init(adev, dir); +	}  }  /* debugfs end */ @@ -1948,6 +2016,8 @@ static DEVICE_ATTR(version, 0444,  		amdgpu_ras_sysfs_version_show, NULL);  static DEVICE_ATTR(schema, 0444,  		amdgpu_ras_sysfs_schema_show, NULL); +static DEVICE_ATTR(event_state, 0444, +		   amdgpu_ras_sysfs_event_state_show, NULL);  static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1958,6 +2028,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  		&con->features_attr.attr,  		&con->version_attr.attr,  		&con->schema_attr.attr, +		&con->event_state_attr.attr,  		NULL  	};  	struct bin_attribute *bin_attrs[] = { @@ -1980,6 +2051,10 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev)  	con->schema_attr = dev_attr_schema;  	sysfs_attr_init(attrs[2]); +	/* add event_state entry */ +	con->event_state_attr = dev_attr_event_state; +	sysfs_attr_init(attrs[3]); +  	if (amdgpu_bad_page_threshold != 0) {  		/* add bad_page_features entry */  		bin_attr_gpu_vram_bad_pages.private = NULL; @@ -2043,8 +2118,16 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  	struct amdgpu_device *adev = obj->adev;  	struct amdgpu_ras_block_object *block_obj =  		amdgpu_ras_get_ras_block(adev, obj->head.block, 0); +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION; +	u64 event_id; +	int ret; -	if (!block_obj) +	if (!block_obj || !con) +		return; + +	ret = amdgpu_ras_mark_ras_event(adev, type); +	if (ret)  		return;  	/* both query_poison_status and handle_poison_consumption are optional, @@ -2067,29 +2150,41 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)  		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); -	/* gpu reset is fallback for failed and default cases */ -	if (poison_stat) { -		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", -				block_obj->ras_comm.name); +	/* gpu reset is fallback for failed and default cases. +	 * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. 
+	 */ +	if (poison_stat && !con->is_rma) { +		event_id = amdgpu_ras_acquire_event_id(adev, type); +		RAS_EVENT_LOG(adev, event_id, +			      "GPU reset for %s RAS poison consumption is issued!\n", +			      block_obj->ras_comm.name);  		amdgpu_ras_reset_gpu(adev); -	} else { -		amdgpu_gfx_poison_consumption_handler(adev, entry);  	} + +	if (!poison_stat) +		amdgpu_gfx_poison_consumption_handler(adev, entry);  }  static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,  				struct amdgpu_iv_entry *entry)  { -	dev_info(obj->adev->dev, -		"Poison is created\n"); +	struct amdgpu_device *adev = obj->adev; +	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; +	u64 event_id; +	int ret; + +	ret = amdgpu_ras_mark_ras_event(adev, type); +	if (ret) +		return; + +	event_id = amdgpu_ras_acquire_event_id(adev, type); +	RAS_EVENT_LOG(adev, event_id, "Poison is created\n");  	if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {  		struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); -		amdgpu_ras_put_poison_req(obj->adev, -			AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false); -  		atomic_inc(&con->page_retirement_req_cnt); +		atomic_inc(&con->poison_creation_count);  		wake_up(&con->page_retirement_wq);  	} @@ -2112,6 +2207,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,  	/* Let IP handle its data, maybe we need get the output  	 * from the callback to update the error type/count, etc  	 */ +	amdgpu_ras_set_fed(obj->adev, true);  	ret = data->cb(obj->adev, &err_data, entry);  	/* ue will trigger an interrupt, and in that case  	 * we need do a reset to recovery the whole system. @@ -2172,12 +2268,15 @@ static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)  int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,  		struct ras_dispatch_if *info)  { -	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); -	struct ras_ih_data *data = &obj->ih_data; +	struct ras_manager *obj; +	struct ras_ih_data *data; +	obj = amdgpu_ras_find_obj(adev, &info->head);  	if (!obj)  		return -EINVAL; +	data = &obj->ih_data; +  	if (data->inuse == 0)  		return 0; @@ -2274,7 +2373,7 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)  /* ih end */  /* traversal all IPs except NBIO to query error counter */ -static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) +static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev, enum ras_event_type type)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_manager *obj; @@ -2307,7 +2406,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)  		     IP_VERSION(13, 0, 2)))  			continue; -		amdgpu_ras_query_error_status(adev, &info); +		amdgpu_ras_query_error_status_with_event(adev, &info, type);  		if (amdgpu_ip_version(adev, MP0_HWIP, 0) !=  			    IP_VERSION(11, 0, 2) && @@ -2429,6 +2528,31 @@ static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev,  	}  } +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) +{ +	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	int hive_ras_recovery = 0; + +	if (hive) { +		hive_ras_recovery = atomic_read(&hive->ras_recovery); +		amdgpu_put_xgmi_hive(hive); +	} + +	if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) +		return true; + +	return false; +} + +static enum ras_event_type amdgpu_ras_get_fatal_error_event(struct amdgpu_device *adev) +{ +	if 
(amdgpu_ras_intr_triggered()) +		return RAS_EVENT_TYPE_FATAL; +	else +		return RAS_EVENT_TYPE_POISON_CONSUMPTION; +} +  static void amdgpu_ras_do_recovery(struct work_struct *work)  {  	struct amdgpu_ras *ras = @@ -2437,6 +2561,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  	struct amdgpu_device *adev = ras->adev;  	struct list_head device_list, *device_list_handle =  NULL;  	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); +	enum ras_event_type type;  	if (hive) {  		atomic_set(&hive->ras_recovery, 1); @@ -2464,10 +2589,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  			device_list_handle = &device_list;  		} +		type = amdgpu_ras_get_fatal_error_event(adev);  		list_for_each_entry(remote_adev,  				device_list_handle, gmc.xgmi.head) {  			amdgpu_ras_query_err_status(remote_adev); -			amdgpu_ras_log_on_err_counter(remote_adev); +			amdgpu_ras_log_on_err_counter(remote_adev, type);  		}  	} @@ -2478,6 +2604,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  		reset_context.method = AMD_RESET_METHOD_NONE;  		reset_context.reset_req_dev = adev; +		reset_context.src = AMDGPU_RESET_SRC_RAS;  		/* Perform full reset in fatal error mode */  		if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) @@ -2758,7 +2885,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)  	memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));  	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); -	ecc_log->de_updated = false; +	ecc_log->de_queried_count = 0; +	ecc_log->prev_de_queried_count = 0;  }  static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -2777,7 +2905,25 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)  	mutex_unlock(&ecc_log->lock);  	mutex_destroy(&ecc_log->lock); -	ecc_log->de_updated = false; +	ecc_log->de_queried_count = 0; +	ecc_log->prev_de_queried_count = 0; +} + +static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, +				uint32_t delayed_ms) +{ +	int ret; + +	mutex_lock(&con->umc_ecc_log.lock); +	ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree, +			UMC_ECC_NEW_DETECTED_TAG); +	mutex_unlock(&con->umc_ecc_log.lock); + +	if (ret) +		schedule_delayed_work(&con->page_retirement_dwork, +			msecs_to_jiffies(delayed_ms)); + +	return ret ? 
true : false;  }  static void amdgpu_ras_do_page_retirement(struct work_struct *work) @@ -2786,87 +2932,140 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)  					      page_retirement_dwork.work);  	struct amdgpu_device *adev = con->adev;  	struct ras_err_data err_data; +	unsigned long err_cnt; -	if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) +	/* If gpu reset is ongoing, delay retiring the bad pages */ +	if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { +		amdgpu_ras_schedule_retirement_dwork(con, +				AMDGPU_RAS_RETIRE_PAGE_INTERVAL * 3);  		return; +	}  	amdgpu_ras_error_data_init(&err_data);  	amdgpu_umc_handle_bad_pages(adev, &err_data); +	err_cnt = err_data.err_addr_cnt;  	amdgpu_ras_error_data_fini(&err_data); -	mutex_lock(&con->umc_ecc_log.lock); -	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, -				UMC_ECC_NEW_DETECTED_TAG)) -		schedule_delayed_work(&con->page_retirement_dwork, -			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL)); -	mutex_unlock(&con->umc_ecc_log.lock); +	if (err_cnt && con->is_rma) +		amdgpu_ras_reset_gpu(adev); + +	amdgpu_ras_schedule_retirement_dwork(con, +			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);  } -static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, -			enum amdgpu_ras_block ras_block, uint32_t timeout_ms) +static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, +				uint32_t poison_creation_count)  {  	int ret = 0;  	struct ras_ecc_log_info *ecc_log;  	struct ras_query_if info; -	uint32_t timeout = timeout_ms; +	uint32_t timeout = 0;  	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	uint64_t de_queried_count; +	uint32_t new_detect_count, total_detect_count; +	uint32_t need_query_count = poison_creation_count; +	bool query_data_timeout = false; +	enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION;  	memset(&info, 0, sizeof(info)); -	info.head.block = ras_block; +	info.head.block = AMDGPU_RAS_BLOCK__UMC;  	ecc_log = &ras->umc_ecc_log; -	ecc_log->de_updated = false; +	total_detect_count = 0;  	do { -		ret = amdgpu_ras_query_error_status(adev, &info); -		if (ret) { -			dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); +		ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); +		if (ret)  			return ret; + +		de_queried_count = ecc_log->de_queried_count; +		if (de_queried_count > ecc_log->prev_de_queried_count) { +			new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; +			ecc_log->prev_de_queried_count = de_queried_count; +			timeout = 0; +		} else { +			new_detect_count = 0;  		} -		if (timeout && !ecc_log->de_updated) { -			msleep(1); -			timeout--; +		if (new_detect_count) { +			total_detect_count += new_detect_count; +		} else { +			if (!timeout && need_query_count) +				timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + +			if (timeout) { +				if (!--timeout) { +					query_data_timeout = true; +					break; +				} +				msleep(1); +			}  		} -	} while (timeout && !ecc_log->de_updated); +	} while (total_detect_count < need_query_count); -	if (timeout_ms && !timeout) { -		dev_warn(adev->dev, "Can't find deferred error\n"); -		return -ETIMEDOUT; +	if (query_data_timeout) { +		dev_warn(adev->dev, "Can't find deferred error! 
count: %u\n", +			(need_query_count - total_detect_count)); +		return -ENOENT;  	} +	if (total_detect_count) +		schedule_delayed_work(&ras->page_retirement_dwork, 0); +  	return 0;  } -static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, -					uint32_t timeout) +static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	struct ras_poison_msg msg;  	int ret; -	ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); -	if (!ret) -		schedule_delayed_work(&con->page_retirement_dwork, 0); +	do { +		ret = kfifo_get(&con->poison_fifo, &msg); +	} while (ret);  }  static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, -			struct ras_poison_msg *poison_msg) +			uint32_t msg_count, uint32_t *gpu_reset)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	uint32_t reset = poison_msg->reset; -	uint16_t pasid = poison_msg->pasid; +	uint32_t reset_flags = 0, reset = 0; +	struct ras_poison_msg msg; +	int ret, i;  	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); -	if (poison_msg->pasid_fn) -		poison_msg->pasid_fn(adev, pasid, poison_msg->data); +	for (i = 0; i < msg_count; i++) { +		ret = amdgpu_ras_get_poison_req(adev, &msg); +		if (!ret) +			continue; + +		if (msg.pasid_fn) +			msg.pasid_fn(adev, msg.pasid, msg.data); + +		reset_flags |= msg.reset; +	} + +	/* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ +	if (reset_flags && !con->is_rma) { +		if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) +			reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET; +		else if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE2_RESET) +			reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; +		else +			reset = reset_flags; -	if (reset) {  		flush_delayed_work(&con->page_retirement_dwork);  		con->gpu_reset_flags |= reset;  		amdgpu_ras_reset_gpu(adev); + +		*gpu_reset = reset; + +		/* Wait for gpu recovery to complete */ +		flush_work(&con->recovery_work);  	}  	return 0; @@ -2876,9 +3075,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)param;  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	struct ras_poison_msg poison_msg; -	enum amdgpu_ras_block ras_block; -	bool poison_creation_is_handled = false; +	uint32_t poison_creation_count, msg_count; +	uint32_t gpu_reset; +	int ret;  	while (!kthread_should_stop()) { @@ -2889,33 +3088,61 @@ static int amdgpu_ras_page_retirement_thread(void *param)  		if (kthread_should_stop())  			break; -		atomic_dec(&con->page_retirement_req_cnt); +		gpu_reset = 0; -		if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) -			continue; +		do { +			poison_creation_count = atomic_read(&con->poison_creation_count); +			ret = amdgpu_ras_poison_creation_handler(adev, poison_creation_count); +			if (ret == -EIO) +				break; -		ras_block = poison_msg.block; +			if (poison_creation_count) { +				atomic_sub(poison_creation_count, &con->poison_creation_count); +				atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); +			} +		} while (atomic_read(&con->poison_creation_count)); + +		if (ret != -EIO) { +			msg_count = kfifo_len(&con->poison_fifo); +			if (msg_count) { +				ret = amdgpu_ras_poison_consumption_handler(adev, +						msg_count, &gpu_reset); +				if ((ret != -EIO) && +				    (gpu_reset != AMDGPU_RAS_GPU_RESET_MODE1_RESET)) +					atomic_sub(msg_count, &con->page_retirement_req_cnt); +			} +		} -		dev_info(adev->dev, "Start processing ras block %s(%d)\n", -				
ras_block_str(ras_block), ras_block); +		if ((ret == -EIO) || (gpu_reset == AMDGPU_RAS_GPU_RESET_MODE1_RESET)) { +			/* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ +			/* Clear poison creation request */ +			atomic_set(&con->poison_creation_count, 0); -		if (ras_block == AMDGPU_RAS_BLOCK__UMC) { -			amdgpu_ras_poison_creation_handler(adev, -				MAX_UMC_POISON_POLLING_TIME_ASYNC); -			poison_creation_is_handled = true; -		} else { -			/* poison_creation_is_handled: -			 *   false: no poison creation interrupt, but it has poison -			 *          consumption interrupt. -			 *   true: It has poison creation interrupt at the beginning, -			 *         but it has no poison creation interrupt later. -			 */ -			amdgpu_ras_poison_creation_handler(adev, -					poison_creation_is_handled ? -					0 : MAX_UMC_POISON_POLLING_TIME_ASYNC); +			/* Clear poison fifo */ +			amdgpu_ras_clear_poison_fifo(adev); + +			/* Clear all poison requests */ +			atomic_set(&con->page_retirement_req_cnt, 0); + +			if (ret == -EIO) { +				/* Wait for mode-1 reset to complete */ +				down_read(&adev->reset_domain->sem); +				up_read(&adev->reset_domain->sem); +			} + +			/* Wake up work to save bad pages to eeprom */ +			schedule_delayed_work(&con->page_retirement_dwork, 0); +		} else if (gpu_reset) { +			/* gpu just completed mode-2 reset or other reset */ +			/* Clear poison consumption messages cached in fifo */ +			msg_count = kfifo_len(&con->poison_fifo); +			if (msg_count) { +				amdgpu_ras_clear_poison_fifo(adev); +				atomic_sub(msg_count, &con->page_retirement_req_cnt); +			} -			amdgpu_ras_poison_consumption_handler(adev, &poison_msg); -			poison_creation_is_handled = false; +			/* Wake up work to save bad pages to eeprom */ +			schedule_delayed_work(&con->page_retirement_dwork, 0);  		}  	} @@ -2927,7 +3154,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_err_handler_data **data;  	u32  max_eeprom_records_count = 0; -	bool exc_err_limit = false;  	int ret;  	if (!con || amdgpu_sriov_vf(adev)) @@ -2964,12 +3190,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  	 */  	if (adev->gmc.xgmi.pending_reset)  		return 0; -	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); +	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);  	/* -	 * This calling fails when exc_err_limit is true or +	 * This calling fails when is_rma is true or  	 * ret != 0.  	 */ -	if (exc_err_limit || ret) +	if (con->is_rma || ret)  		goto free;  	if (con->eeprom_control.ras_num_recs) { @@ -2990,6 +3216,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  	mutex_init(&con->page_retirement_lock);  	init_waitqueue_head(&con->page_retirement_wq);  	atomic_set(&con->page_retirement_req_cnt, 0); +	atomic_set(&con->poison_creation_count, 0);  	con->page_retirement_thread =  		kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement");  	if (IS_ERR(con->page_retirement_thread)) { @@ -3017,7 +3244,7 @@ out:  	 * Except error threshold exceeding case, other failure cases in this  	 * function would not fail amdgpu driver init.  	 
*/ -	if (!exc_err_limit) +	if (!con->is_rma)  		ret = 0;  	else  		ret = -EINVAL; @@ -3029,15 +3256,24 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_err_handler_data *data = con->eh_data; +	int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES; +	bool ret;  	/* recovery_init failed to init it, fini is useless */  	if (!data)  		return 0; +	/* Save all cached bad pages to eeprom */ +	do { +		flush_delayed_work(&con->page_retirement_dwork); +		ret = amdgpu_ras_schedule_retirement_dwork(con, 0); +	} while (ret && max_flush_timeout--); +  	if (con->page_retirement_thread)  		kthread_stop(con->page_retirement_thread);  	atomic_set(&con->page_retirement_req_cnt, 0); +	atomic_set(&con->poison_creation_count, 0);  	mutex_destroy(&con->page_rsv_lock); @@ -3063,6 +3299,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  		switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {  		case IP_VERSION(13, 0, 2):  		case IP_VERSION(13, 0, 6): +		case IP_VERSION(13, 0, 14):  			return true;  		default:  			return false; @@ -3074,6 +3311,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  		case IP_VERSION(13, 0, 0):  		case IP_VERSION(13, 0, 6):  		case IP_VERSION(13, 0, 10): +		case IP_VERSION(13, 0, 14):  			return true;  		default:  			return false; @@ -3270,10 +3508,17 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)  static void ras_event_mgr_init(struct ras_event_manager *mgr)  { +	struct ras_event_state *event_state;  	int i; -	for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++) -		atomic64_set(&mgr->seqnos[i], 0); +	memset(mgr, 0, sizeof(*mgr)); +	atomic64_set(&mgr->seqno, 0); + +	for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { +		event_state = &mgr->event_state[i]; +		event_state->last_seqno = RAS_EVENT_INVALID_ID; +		atomic64_set(&event_state->count, 0); +	}  }  static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) @@ -3297,6 +3542,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)  		amdgpu_put_xgmi_hive(hive);  } +static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + +	if (!con || (adev->flags & AMD_IS_APU)) +		return; + +	switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { +	case IP_VERSION(13, 0, 2): +	case IP_VERSION(13, 0, 6): +	case IP_VERSION(13, 0, 14): +		con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE; +		break; +	default: +		break; +	} +} +  int amdgpu_ras_init(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3402,11 +3665,22 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	/* Get RAS schema for particular SOC */  	con->schema = amdgpu_get_ras_schema(adev); +	amdgpu_ras_init_reserved_vram_size(adev); +  	if (amdgpu_ras_fs_init(adev)) {  		r = -EINVAL;  		goto release_con;  	} +	if (amdgpu_ras_aca_is_supported(adev)) { +		if (amdgpu_aca_is_enabled(adev)) +			r = amdgpu_aca_init(adev); +		else +			r = amdgpu_mca_init(adev); +		if (r) +			goto release_con; +	} +  	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "  		 "hardware ability[%x] ras_mask[%x]\n",  		 adev->ras_hw_enabled, adev->ras_enabled); @@ -3613,25 +3887,30 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)  	struct amdgpu_ras_block_object *obj;  	int r; -	/* Guest side doesn't need init ras feature */ -	if (amdgpu_sriov_vf(adev)) -		return 0; -  	amdgpu_ras_event_mgr_init(adev); -	if 
(amdgpu_aca_is_enabled(adev)) { -		if (amdgpu_in_reset(adev)) -			r = amdgpu_aca_reset(adev); -		 else -			r = amdgpu_aca_init(adev); -		if (r) -			return r; +	if (amdgpu_ras_aca_is_supported(adev)) { +		if (amdgpu_in_reset(adev)) { +			if (amdgpu_aca_is_enabled(adev)) +				r = amdgpu_aca_reset(adev); +			else +				r = amdgpu_mca_reset(adev); +			if (r) +				return r; +		} -		amdgpu_ras_set_aca_debug_mode(adev, false); -	} else { -		amdgpu_ras_set_mca_debug_mode(adev, false); +		if (!amdgpu_sriov_vf(adev)) { +			if (amdgpu_aca_is_enabled(adev)) +				amdgpu_ras_set_aca_debug_mode(adev, false); +			else +				amdgpu_ras_set_mca_debug_mode(adev, false); +		}  	} +	/* Guest side doesn't need init ras feature */ +	if (amdgpu_sriov_vf(adev)) +		return 0; +  	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {  		obj = node->ras_obj;  		if (!obj) { @@ -3699,8 +3978,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)  	amdgpu_ras_fs_fini(adev);  	amdgpu_ras_interrupt_remove_all(adev); -	if (amdgpu_aca_is_enabled(adev)) -		amdgpu_aca_fini(adev); +	if (amdgpu_ras_aca_is_supported(adev)) { +		if (amdgpu_aca_is_enabled(adev)) +			amdgpu_aca_fini(adev); +		else +			amdgpu_mca_fini(adev); +	}  	WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); @@ -3735,23 +4018,68 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)  		atomic_set(&ras->fed, !!status);  } -bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) +static struct ras_event_manager *__get_ras_event_mgr(struct amdgpu_device *adev) +{ +	struct amdgpu_ras *ras; + +	ras = amdgpu_ras_get_context(adev); +	if (!ras) +		return NULL; + +	return ras->event_mgr; +} + +int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, +				     const void *caller)  { -	return !(id & BIT_ULL(63)); +	struct ras_event_manager *event_mgr; +	struct ras_event_state *event_state; +	int ret = 0; + +	if (type >= RAS_EVENT_TYPE_COUNT) { +		ret = -EINVAL; +		goto out; +	} + +	event_mgr = __get_ras_event_mgr(adev); +	if (!event_mgr) { +		ret = -EINVAL; +		goto out; +	} + +	event_state = &event_mgr->event_state[type]; +	event_state->last_seqno = atomic64_inc_return(&event_mgr->seqno); +	atomic64_inc(&event_state->count); + +out: +	if (ret && caller) +		dev_warn(adev->dev, "failed mark ras event (%d) in %ps, ret:%d\n", +			 (int)type, caller, ret); + +	return ret;  }  u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type)  { -	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	struct ras_event_manager *event_mgr;  	u64 id; +	if (type >= RAS_EVENT_TYPE_COUNT) +		return RAS_EVENT_INVALID_ID; +  	switch (type) { -	case RAS_EVENT_TYPE_ISR: -		id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]); +	case RAS_EVENT_TYPE_FATAL: +	case RAS_EVENT_TYPE_POISON_CREATION: +	case RAS_EVENT_TYPE_POISON_CONSUMPTION: +		event_mgr = __get_ras_event_mgr(adev); +		if (!event_mgr) +			return RAS_EVENT_INVALID_ID; + +		id = event_mgr->event_state[type].last_seqno;  		break;  	case RAS_EVENT_TYPE_INVALID:  	default: -		id = BIT_ULL(63) | 0ULL; +		id = RAS_EVENT_INVALID_ID;  		break;  	} @@ -3762,7 +4090,13 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {  	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {  		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); -		u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]); +		enum ras_event_type type = RAS_EVENT_TYPE_FATAL; +		u64 event_id; + +		if 
(amdgpu_ras_mark_ras_event(adev, type)) +			return; + +		event_id = amdgpu_ras_acquire_event_id(adev, type);  		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error"  			      "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); @@ -3952,6 +4286,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)  {  	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	/* mode1 is the only selection for RMA status */ +	if (ras->is_rma) { +		ras->gpu_reset_flags = 0; +		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; +	} +  	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)  		amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);  	return 0; @@ -4284,21 +4624,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d  void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)  { -	struct ras_err_addr *mca_err_addr; -  	/* This function will be retired. */  	return; -	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL); -	if (!mca_err_addr) -		return; - -	INIT_LIST_HEAD(&mca_err_addr->node); - -	mca_err_addr->err_status = err_addr->err_status; -	mca_err_addr->err_ipid = err_addr->err_ipid; -	mca_err_addr->err_addr = err_addr->err_addr; - -	list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);  }  void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr) @@ -4382,64 +4709,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,  #define mmMP0_SMN_C2PMSG_92	0x1609C  #define mmMP0_SMN_C2PMSG_126	0x160BE  static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev, -						 u32 instance, u32 boot_error) +						 u32 instance)  {  	u32 socket_id, aid_id, hbm_id; -	u32 reg_data; +	u32 fw_status; +	u32 boot_error;  	u64 reg_addr; -	socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); -	aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); -	hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); -  	/* The pattern for smn addressing in other SOC could be different from  	 * the one for aqua_vanjaram. We should revisit the code if the pattern  	 * is changed. In such case, replace the aqua_vanjaram implementation  	 * with more common helper */  	reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +  		   aqua_vanjaram_encode_ext_smn_addressing(instance); +	fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); -	reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); -	dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n", -		socket_id, aid_id, reg_data); +	reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + +		   aqua_vanjaram_encode_ext_smn_addressing(instance); +	boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); + +	socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); +	aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); +	hbm_id = ((1 == AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error)) ? 
0 : 1);  	if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n", -			 socket_id, aid_id, hbm_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n", +			 socket_id, aid_id, hbm_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n", -			 socket_id, aid_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n", +			 socket_id, aid_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n", -			 socket_id, aid_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n", +			 socket_id, aid_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n", -			 socket_id, aid_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n", +			 socket_id, aid_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n", -			 socket_id, aid_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n", +			 socket_id, aid_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n", -			 socket_id, aid_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n", +			 socket_id, aid_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n", -			 socket_id, aid_id, hbm_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n", +			 socket_id, aid_id, hbm_id, fw_status);  	if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) -		dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n", -			 socket_id, aid_id, hbm_id); +		dev_info(adev->dev, +			 "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n", +			 socket_id, aid_id, hbm_id, fw_status);  } -static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, -					     u32 instance, u32 *boot_error) +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, +					   u32 instance)  { -	u32 reg_addr; +	u64 reg_addr;  	u32 reg_data;  	int retry_loop; @@ -4448,41 +4785,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,  	for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {  		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); -		if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) { -			*boot_error = AMDGPU_RAS_BOOT_SUCEESS; -			return 0; -		} -		msleep(1); -	} - -	/* The pattern for smn addressing in other SOC could be different from -	 * the one for aqua_vanjaram. We should revisit the code if the pattern -	 * is changed. 
In such case, replace the aqua_vanjaram implementation -	 * with more common helper */ -	reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + -		   aqua_vanjaram_encode_ext_smn_addressing(instance); - -	for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { -		reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); -		if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { -			*boot_error = reg_data; -			return 0; -		} -		msleep(1); +		if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) +			return false; +		else +			msleep(1);  	} -	*boot_error = reg_data; -	return -ETIME; +	return true;  }  void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)  { -	u32 boot_error = 0;  	u32 i;  	for (i = 0; i < num_instances; i++) { -		if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) -			amdgpu_ras_boot_time_error_reporting(adev, i, boot_error); +		if (amdgpu_ras_boot_error_detected(adev, i)) +			amdgpu_ras_boot_time_error_reporting(adev, i);  	}  } @@ -4501,3 +4819,21 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)  	return ret;  } + +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, +				const char *fmt, ...) +{ +	struct va_format vaf; +	va_list args; + +	va_start(args, fmt); +	vaf.fmt = fmt; +	vaf.va = &args; + +	if (RAS_EVENT_ID_IS_VALID(event_id)) +		dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf); +	else +		dev_printk(KERN_INFO, adev->dev, "%pV", &vaf); + +	va_end(args); +} |
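
For readers skimming the hunks above, the core of the new RAS event bookkeeping can be summarized in a small standalone sketch. This is an illustration only, not kernel code: the type and function names mirror the patch (ras_event_manager, ras_event_state, and the mark/acquire pair), but the in-tree version uses atomic64_t counters, reports the caller on failure, and keeps the manager inside the amdgpu RAS context.

/*
 * Standalone illustration (not kernel code) of the event bookkeeping this
 * patch introduces: a single global sequence number shared by all RAS event
 * types, plus a per-type event count and the last sequence number handed
 * out for that type.  The kernel version uses atomic64_t for these fields
 * and initializes last_seqno to RAS_EVENT_INVALID_ID, as done in main().
 */
#include <stdint.h>
#include <stdio.h>

enum ras_event_type {
	RAS_EVENT_TYPE_INVALID = 0,
	RAS_EVENT_TYPE_FATAL,
	RAS_EVENT_TYPE_POISON_CREATION,
	RAS_EVENT_TYPE_POISON_CONSUMPTION,
	RAS_EVENT_TYPE_COUNT,
};

#define RAS_EVENT_INVALID_ID UINT64_MAX

struct ras_event_state {
	uint64_t last_seqno;	/* seqno of the most recent event of this type */
	uint64_t count;		/* how many events of this type were marked */
};

struct ras_event_manager {
	uint64_t seqno;		/* global, monotonically increasing */
	struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT];
};

/* "mark": take the next global seqno and record it against this type */
static int mark_ras_event(struct ras_event_manager *mgr, enum ras_event_type type)
{
	if (type >= RAS_EVENT_TYPE_COUNT)
		return -1;

	mgr->event_state[type].last_seqno = ++mgr->seqno;
	mgr->event_state[type].count++;
	return 0;
}

/* "acquire": log messages later reuse the seqno recorded by mark_ras_event() */
static uint64_t acquire_event_id(struct ras_event_manager *mgr, enum ras_event_type type)
{
	if (type == RAS_EVENT_TYPE_INVALID || type >= RAS_EVENT_TYPE_COUNT)
		return RAS_EVENT_INVALID_ID;

	return mgr->event_state[type].last_seqno;
}

int main(void)
{
	struct ras_event_manager mgr = { 0 };
	int i;

	for (i = 0; i < RAS_EVENT_TYPE_COUNT; i++)
		mgr.event_state[i].last_seqno = RAS_EVENT_INVALID_ID;

	mark_ras_event(&mgr, RAS_EVENT_TYPE_POISON_CREATION);
	mark_ras_event(&mgr, RAS_EVENT_TYPE_POISON_CONSUMPTION);

	printf("poison creation event id: %llu\n",
	       (unsigned long long)acquire_event_id(&mgr, RAS_EVENT_TYPE_POISON_CREATION));
	printf("poison consumption event id: %llu\n",
	       (unsigned long long)acquire_event_id(&mgr, RAS_EVENT_TYPE_POISON_CONSUMPTION));

	return 0;
}

With this bookkeeping in place, the event_state sysfs attribute added by the patch prints the current global seqno followed by one "count:.../last_seqno:..." line for each dumped type (Fatal Error, Poison Creation, Poison Consumption), and RAS_EVENT_LOG prefixes the matching kernel log messages with {event_id} whenever the id is valid.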