diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 169 | 
1 files changed, 96 insertions, 73 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 493982f94649..c50202215f6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -28,7 +28,7 @@  #define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} -typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data); +typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);  struct aca_banks {  	int nr_banks; @@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)  	}  } -static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count) +static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)  {  	struct amdgpu_aca *aca = &adev->aca;  	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -116,20 +116,22 @@ static struct aca_regs_dump {  	{"CONTROL_MASK",	ACA_REG_IDX_CTL_MASK},  }; -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank) +static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, +			      struct ras_query_context *qctx)  { +	u64 event_id = qctx ? qctx->event_id : 0ULL;  	int i; -	dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");  	/* plus 1 for output format, e.g: ACA[08/08]: xxxx */  	for (i = 0; i < ARRAY_SIZE(aca_regs); i++) -		dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", -			 idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); +		RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", +			      idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);  } -static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type, +static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,  				       int start, int count, -				       struct aca_banks *banks) +				       struct aca_banks *banks, struct ras_query_context *qctx)  {  	struct amdgpu_aca *aca = &adev->aca;  	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -143,13 +145,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro  		return -EOPNOTSUPP;  	switch (type) { -	case ACA_ERROR_TYPE_UE: +	case ACA_SMU_TYPE_UE:  		max_count = smu_funcs->max_ue_bank_count;  		break; -	case ACA_ERROR_TYPE_CE: +	case ACA_SMU_TYPE_CE:  		max_count = smu_funcs->max_ce_bank_count;  		break; -	case ACA_ERROR_TYPE_DEFERRED:  	default:  		return -EINVAL;  	} @@ -164,7 +165,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro  		if (ret)  			return ret; -		aca_smu_bank_dump(adev, i, count, &bank); +		bank.type = type; + +		aca_smu_bank_dump(adev, i, count, &bank, qctx);  		ret = aca_banks_add_bank(banks, &bank);  		if (ret) @@ -195,7 +198,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t  	return hwip->hwid == hwid && hwip->mcatype == mcatype;  } -static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type) +static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)  {  	const struct aca_bank_ops *bank_ops = handle->bank_ops; @@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_  	return new_bank_error(aerr, info);  } -static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type, -			  struct aca_bank_report *report) +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, +				   enum aca_error_type type, u64 count)  {  	struct aca_error_cache *error_cache = &handle->error_cache;  	struct aca_bank_error *bank_error;  	struct aca_error *aerr; -	if (!handle || !report) +	if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)  		return -EINVAL; -	if (!report->count[type]) +	if (!count)  		return 0;  	aerr = &error_cache->errors[type]; -	bank_error = get_bank_error(aerr, &report->info); +	bank_error = get_bank_error(aerr, info);  	if (!bank_error)  		return -ENOMEM; -	bank_error->count[type] += report->count[type]; +	bank_error->count += count;  	return 0;  } -static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank, -				    enum aca_error_type type, struct aca_bank_report *report) +static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)  {  	const struct aca_bank_ops *bank_ops = handle->bank_ops; -	if (!bank || !report) +	if (!bank)  		return -EINVAL; -	if (!bank_ops->aca_bank_generate_report) +	if (!bank_ops->aca_bank_parser)  		return -EOPNOTSUPP; -	memset(report, 0, sizeof(*report)); -	return bank_ops->aca_bank_generate_report(handle, bank, type, -						  report, handle->data); +	return bank_ops->aca_bank_parser(handle, bank, type, +					 handle->data);  }  static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, -				      enum aca_error_type type, void *data) +				      enum aca_smu_type type, void *data)  { -	struct aca_bank_report report;  	int ret; -	ret = aca_generate_bank_report(handle, bank, type, &report); -	if (ret) -		return ret; - -	if (!report.count[type]) -		return 0; - -	ret = aca_log_errors(handle, type, &report); +	ret = aca_bank_parser(handle, bank, type);  	if (ret)  		return ret; @@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank  }  static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, -			     enum aca_error_type type, bank_handler_t handler, void *data) +			     enum aca_smu_type type, bank_handler_t handler, void *data)  {  	struct aca_handle *handle;  	int ret; @@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba  }  static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, -			      enum aca_error_type type, bank_handler_t handler, void *data) +			      enum aca_smu_type type, bank_handler_t handler, void *data)  {  	struct aca_bank_node *node;  	struct aca_bank *bank; @@ -378,8 +371,28 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *  	return 0;  } -static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type, -			    bank_handler_t handler, void *data) +static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) +{ +	struct amdgpu_aca *aca = &adev->aca; +	bool ret = true; + +	/* +	 * Because the UE Valid MCA count will only be cleared after reset, +	 * in order to avoid repeated counting of the error count, +	 * the aca bank is only updated once during the gpu recovery stage. +	 */ +	if (type == ACA_SMU_TYPE_UE) { +		if (amdgpu_ras_intr_triggered()) +			ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; +		else +			atomic_set(&aca->ue_update_flag, 0); +	} + +	return ret; +} + +static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, +			    bank_handler_t handler, struct ras_query_context *qctx, void *data)  {  	struct amdgpu_aca *aca = &adev->aca;  	struct aca_banks banks; @@ -389,9 +402,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type  	if (list_empty(&aca->mgr.list))  		return 0; -	/* NOTE: pmfw is only support UE and CE */ -	if (type == ACA_ERROR_TYPE_DEFERRED) -		type = ACA_ERROR_TYPE_CE; +	if (!aca_bank_should_update(adev, type)) +		return 0;  	ret = aca_smu_get_valid_aca_count(adev, type, &count);  	if (ret) @@ -402,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type  	aca_banks_init(&banks); -	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); +	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);  	if (ret)  		goto err_release_banks; @@ -431,7 +443,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er  	if (type >= ACA_ERROR_TYPE_COUNT)  		return -EINVAL; -	count = bank_error->count[type]; +	count = bank_error->count;  	if (!count)  		return 0; @@ -447,6 +459,8 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er  		amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);  		break;  	case ACA_ERROR_TYPE_DEFERRED: +		amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count); +		break;  	default:  		break;  	} @@ -477,12 +491,25 @@ out_unlock:  }  static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, -				struct ras_err_data *err_data) +				struct ras_err_data *err_data, struct ras_query_context *qctx)  { +	enum aca_smu_type smu_type;  	int ret; +	switch (type) { +	case ACA_ERROR_TYPE_UE: +		smu_type = ACA_SMU_TYPE_UE; +		break; +	case ACA_ERROR_TYPE_CE: +	case ACA_ERROR_TYPE_DEFERRED: +		smu_type = ACA_SMU_TYPE_CE; +		break; +	default: +		return -EINVAL; +	} +  	/* udpate aca bank to aca source error_cache first */ -	ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL); +	ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);  	if (ret)  		return ret; @@ -498,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle)  }  int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, -			      enum aca_error_type type, void *data) +			      enum aca_error_type type, struct ras_err_data *err_data, +			      struct ras_query_context *qctx)  { -	struct ras_err_data *err_data = (struct ras_err_data *)data; -  	if (!handle || !err_data)  		return -EINVAL; @@ -511,7 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han  	if (!(BIT(type) & handle->mask))  		return  0; -	return __aca_get_error_data(adev, handle, type, err_data); +	return __aca_get_error_data(adev, handle, type, err_data, qctx);  }  static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) @@ -668,6 +694,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev)  	struct amdgpu_aca *aca = &adev->aca;  	int ret; +	atomic_set(&aca->ue_update_flag, 0); +  	ret = aca_manager_init(&aca->mgr);  	if (ret)  		return ret; @@ -680,6 +708,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)  	struct amdgpu_aca *aca = &adev->aca;  	aca_manager_fini(&aca->mgr); + +	atomic_set(&aca->ue_update_flag, 0);  }  int amdgpu_aca_reset(struct amdgpu_device *adev) @@ -723,23 +753,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)  static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)  { -	int error_code; - -	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { -	case IP_VERSION(13, 0, 6): -		if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) { -			error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); -			return error_code & 0xff; -		} -		break; -	default: -		break; -	} +	struct amdgpu_aca *aca = &adev->aca; +	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; -	/* NOTE: the true error code is encoded in status.errorcode[0:7] */ -	error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); +	if (!smu_funcs || !smu_funcs->parse_error_code) +		return -EOPNOTSUPP; -	return error_code & 0xff; +	return smu_funcs->parse_error_code(adev, bank);  }  int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) @@ -750,6 +770,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank  		return -EINVAL;  	error_code = aca_bank_get_error_code(adev, bank); +	if (error_code < 0) +		return error_code; +  	for (i = 0; i < size; i++) {  		if (err_codes[i] == error_code)  			return 0; @@ -784,7 +807,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)  	return 0;  } -static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx) +static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)  {  	struct aca_bank_info info;  	int i, ret; @@ -793,7 +816,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e  	if (ret)  		return; -	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_ERROR_TYPE_UE ? "UE" : "CE"); +	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_SMU_TYPE_UE ? "UE" : "CE");  	seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",  		   idx, info.socket_id, info.die_id, info.hwid, info.mcatype); @@ -807,7 +830,7 @@ struct aca_dump_context {  };  static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, -				 enum aca_error_type type, void *data) +				 enum aca_smu_type type, void *data)  {  	struct aca_dump_context *ctx = (struct aca_dump_context *)data; @@ -816,7 +839,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban  	return handler_aca_log_bank_error(handle, bank, type, NULL);  } -static int aca_dump_show(struct seq_file *m, enum aca_error_type type) +static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;  	struct aca_dump_context context = { @@ -824,12 +847,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)  		.idx = 0,  	}; -	return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context); +	return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);  }  static int aca_dump_ce_show(struct seq_file *m, void *unused)  { -	return aca_dump_show(m, ACA_ERROR_TYPE_CE); +	return aca_dump_show(m, ACA_SMU_TYPE_CE);  }  static int aca_dump_ce_open(struct inode *inode, struct file *file) @@ -847,7 +870,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {  static int aca_dump_ue_show(struct seq_file *m, void *unused)  { -	return aca_dump_show(m, ACA_ERROR_TYPE_UE); +	return aca_dump_show(m, ACA_SMU_TYPE_UE);  }  static int aca_dump_ue_open(struct inode *inode, struct file *file) |