Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
150 files changed, 3506 insertions, 1146 deletions
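The headline addition in this diffstat is the new amdgpu_dev_coredump.c (its full text appears further down in this diff), which publishes a post-reset device snapshot through the kernel's devcoredump facility: allocate a payload, fill it from the reset context, and hand it to dev_coredumpm() together with a read callback that renders the dump via a drm_coredump_printer. Below is a minimal sketch of that registration pattern, not the driver's actual code — the payload struct and the my_coredump_* names are illustrative placeholders, and the real amdgpu callback additionally walks IP versions, firmware versions, page-fault info and ring contents, as the new file shows.

	/* Sketch of the devcoredump pattern used by amdgpu_dev_coredump.c
	 * (simplified; my_coredump_info and my_coredump_* are illustrative).
	 */
	#include <linux/devcoredump.h>
	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/timekeeping.h>
	#include <drm/drm_print.h>

	struct my_coredump_info {
		struct timespec64 reset_time;
		bool vram_lost;
	};

	/* Called by the devcoredump core when userspace reads the dump file. */
	static ssize_t my_coredump_read(char *buffer, loff_t offset, size_t count,
					void *data, size_t datalen)
	{
		struct my_coredump_info *info = data;
		struct drm_print_iterator iter = {
			.data = buffer,
			.offset = 0,
			.start = offset,
			.remain = count,
		};
		struct drm_printer p = drm_coredump_printer(&iter);

		drm_printf(&p, "time: %lld.%09ld\n", info->reset_time.tv_sec,
			   info->reset_time.tv_nsec);
		if (info->vram_lost)
			drm_printf(&p, "VRAM is lost due to GPU reset!\n");

		return count - iter.remain;
	}

	static void my_coredump_free(void *data)
	{
		kfree(data);
	}

	/* GFP_NOWAIT throughout: this can run from the GPU reset path,
	 * where sleeping allocations are not allowed.
	 */
	static void my_coredump(struct device *dev, bool vram_lost)
	{
		struct my_coredump_info *info = kzalloc(sizeof(*info), GFP_NOWAIT);

		if (!info)
			return;

		info->vram_lost = vram_lost;
		ktime_get_ts64(&info->reset_time);

		/* devcoredump takes ownership of info and frees it through
		 * my_coredump_free() once the dump is read or times out.
		 */
		dev_coredumpm(dev, THIS_MODULE, info, 0, GFP_NOWAIT,
			      my_coredump_read, my_coredump_free);
	}

Userspace then retrieves the dump from /sys/class/devcoredump/devcd*/data. The actual hunks follow.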
| diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile b/drivers/gpu/drm/amd/amdgpu/Makefile index 4536c8ad0e11..1f6b56ec99f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/Makefile +++ b/drivers/gpu/drm/amd/amdgpu/Makefile @@ -70,7 +70,8 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \  	amdgpu_cs.o amdgpu_bios.o amdgpu_benchmark.o \  	atombios_dp.o amdgpu_afmt.o amdgpu_trace_points.o \  	atombios_encoders.o amdgpu_sa.o atombios_i2c.o \ -	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_ib.o amdgpu_pll.o \ +	amdgpu_dma_buf.o amdgpu_vm.o amdgpu_vm_pt.o amdgpu_vm_tlb_fence.o \ +	amdgpu_ib.o amdgpu_pll.o \  	amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \  	amdgpu_gtt_mgr.o amdgpu_preempt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o \  	amdgpu_atomfirmware.o amdgpu_vf_error.o amdgpu_sched.o \ @@ -80,7 +81,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o amdgpu_kms.o \  	amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \  	amdgpu_fw_attestation.o amdgpu_securedisplay.o \  	amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ -	amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o +	amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o amdgpu_dev_coredump.o  amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o @@ -247,7 +248,8 @@ amdgpu-y += \  	smuio_v11_0_6.o \  	smuio_v13_0.o \  	smuio_v13_0_3.o \ -	smuio_v13_0_6.o +	smuio_v13_0_6.o \ +	smuio_v14_0_2.o  # add reset block  amdgpu-y += \ diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c b/drivers/gpu/drm/amd/amdgpu/aldebaran.c index 576067d66bb9..d0a8da67dc2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c +++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c @@ -97,7 +97,7 @@ static int aldebaran_mode2_suspend_ip(struct amdgpu_device *adev)  		adev->ip_blocks[i].status.hw = false;  	} -	return r; +	return 0;  }  static int diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 9c62552bec34..f87d53e183c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -139,6 +139,14 @@ enum amdgpu_ss {  	AMDGPU_SS_DRV_UNLOAD  }; +struct amdgpu_hwip_reg_entry { +	u32		hwip; +	u32		inst; +	u32		seg; +	u32		reg_offset; +	const char	*reg_name; +}; +  struct amdgpu_watchdog_timer {  	bool timeout_fatal_disable;  	uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */ @@ -210,6 +218,7 @@ extern int amdgpu_async_gfx_ring;  extern int amdgpu_mcbp;  extern int amdgpu_discovery;  extern int amdgpu_mes; +extern int amdgpu_mes_log_enable;  extern int amdgpu_mes_kiq;  extern int amdgpu_noretry;  extern int amdgpu_force_asic_type; @@ -493,6 +502,7 @@ struct amdgpu_wb {  	uint64_t		gpu_addr;  	u32			num_wb;	/* Number of wb slots actually reserved for amdgpu. 
*/  	unsigned long		used[DIV_ROUND_UP(AMDGPU_MAX_WB, BITS_PER_LONG)]; +	spinlock_t		lock;  };  int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb); @@ -605,7 +615,7 @@ struct amdgpu_asic_funcs {  	/* PCIe replay counter */  	uint64_t (*get_pcie_replay_count)(struct amdgpu_device *adev);  	/* device supports BACO */ -	bool (*supports_baco)(struct amdgpu_device *adev); +	int (*supports_baco)(struct amdgpu_device *adev);  	/* pre asic_init quirks */  	void (*pre_asic_init)(struct amdgpu_device *adev);  	/* enter/exit umd stable pstate */ @@ -1407,7 +1417,8 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev);  bool amdgpu_device_supports_px(struct drm_device *dev);  bool amdgpu_device_supports_boco(struct drm_device *dev);  bool amdgpu_device_supports_smart_shift(struct drm_device *dev); -bool amdgpu_device_supports_baco(struct drm_device *dev); +int amdgpu_device_supports_baco(struct drm_device *dev); +void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev);  bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,  				      struct amdgpu_device *peer_adev);  int amdgpu_device_baco_enter(struct drm_device *dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 493982f94649..c50202215f6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -28,7 +28,7 @@  #define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype} -typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data); +typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data);  struct aca_banks {  	int nr_banks; @@ -86,7 +86,7 @@ static void aca_banks_release(struct aca_banks *banks)  	}  } -static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_error_type type, u32 *count) +static int aca_smu_get_valid_aca_count(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count)  {  	struct amdgpu_aca *aca = &adev->aca;  	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -116,20 +116,22 @@ static struct aca_regs_dump {  	{"CONTROL_MASK",	ACA_REG_IDX_CTL_MASK},  }; -static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank) +static void aca_smu_bank_dump(struct amdgpu_device *adev, int idx, int total, struct aca_bank *bank, +			      struct ras_query_context *qctx)  { +	u64 event_id = qctx ? 
qctx->event_id : 0ULL;  	int i; -	dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n");  	/* plus 1 for output format, e.g: ACA[08/08]: xxxx */  	for (i = 0; i < ARRAY_SIZE(aca_regs); i++) -		dev_info(adev->dev, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", -			 idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); +		RAS_EVENT_LOG(adev, event_id, HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", +			      idx + 1, total, aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);  } -static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_error_type type, +static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_smu_type type,  				       int start, int count, -				       struct aca_banks *banks) +				       struct aca_banks *banks, struct ras_query_context *qctx)  {  	struct amdgpu_aca *aca = &adev->aca;  	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; @@ -143,13 +145,12 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro  		return -EOPNOTSUPP;  	switch (type) { -	case ACA_ERROR_TYPE_UE: +	case ACA_SMU_TYPE_UE:  		max_count = smu_funcs->max_ue_bank_count;  		break; -	case ACA_ERROR_TYPE_CE: +	case ACA_SMU_TYPE_CE:  		max_count = smu_funcs->max_ce_bank_count;  		break; -	case ACA_ERROR_TYPE_DEFERRED:  	default:  		return -EINVAL;  	} @@ -164,7 +165,9 @@ static int aca_smu_get_valid_aca_banks(struct amdgpu_device *adev, enum aca_erro  		if (ret)  			return ret; -		aca_smu_bank_dump(adev, i, count, &bank); +		bank.type = type; + +		aca_smu_bank_dump(adev, i, count, &bank, qctx);  		ret = aca_banks_add_bank(banks, &bank);  		if (ret) @@ -195,7 +198,7 @@ static bool aca_bank_hwip_is_matched(struct aca_bank *bank, enum aca_hwip_type t  	return hwip->hwid == hwid && hwip->mcatype == mcatype;  } -static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type) +static bool aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)  {  	const struct aca_bank_ops *bank_ops = handle->bank_ops; @@ -273,59 +276,49 @@ static struct aca_bank_error *get_bank_error(struct aca_error *aerr, struct aca_  	return new_bank_error(aerr, info);  } -static int aca_log_errors(struct aca_handle *handle, enum aca_error_type type, -			  struct aca_bank_report *report) +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, +				   enum aca_error_type type, u64 count)  {  	struct aca_error_cache *error_cache = &handle->error_cache;  	struct aca_bank_error *bank_error;  	struct aca_error *aerr; -	if (!handle || !report) +	if (!handle || !info || type >= ACA_ERROR_TYPE_COUNT)  		return -EINVAL; -	if (!report->count[type]) +	if (!count)  		return 0;  	aerr = &error_cache->errors[type]; -	bank_error = get_bank_error(aerr, &report->info); +	bank_error = get_bank_error(aerr, info);  	if (!bank_error)  		return -ENOMEM; -	bank_error->count[type] += report->count[type]; +	bank_error->count += count;  	return 0;  } -static int aca_generate_bank_report(struct aca_handle *handle, struct aca_bank *bank, -				    enum aca_error_type type, struct aca_bank_report *report) +static int aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type)  {  	const struct aca_bank_ops *bank_ops = handle->bank_ops; -	if (!bank || !report) +	if (!bank)  		return -EINVAL; -	if (!bank_ops->aca_bank_generate_report) +	if 
(!bank_ops->aca_bank_parser)  		return -EOPNOTSUPP; -	memset(report, 0, sizeof(*report)); -	return bank_ops->aca_bank_generate_report(handle, bank, type, -						  report, handle->data); +	return bank_ops->aca_bank_parser(handle, bank, type, +					 handle->data);  }  static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank *bank, -				      enum aca_error_type type, void *data) +				      enum aca_smu_type type, void *data)  { -	struct aca_bank_report report;  	int ret; -	ret = aca_generate_bank_report(handle, bank, type, &report); -	if (ret) -		return ret; - -	if (!report.count[type]) -		return 0; - -	ret = aca_log_errors(handle, type, &report); +	ret = aca_bank_parser(handle, bank, type);  	if (ret)  		return ret; @@ -333,7 +326,7 @@ static int handler_aca_log_bank_error(struct aca_handle *handle, struct aca_bank  }  static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *bank, -			     enum aca_error_type type, bank_handler_t handler, void *data) +			     enum aca_smu_type type, bank_handler_t handler, void *data)  {  	struct aca_handle *handle;  	int ret; @@ -354,7 +347,7 @@ static int aca_dispatch_bank(struct aca_handle_manager *mgr, struct aca_bank *ba  }  static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *banks, -			      enum aca_error_type type, bank_handler_t handler, void *data) +			      enum aca_smu_type type, bank_handler_t handler, void *data)  {  	struct aca_bank_node *node;  	struct aca_bank *bank; @@ -378,8 +371,28 @@ static int aca_dispatch_banks(struct aca_handle_manager *mgr, struct aca_banks *  	return 0;  } -static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type, -			    bank_handler_t handler, void *data) +static bool aca_bank_should_update(struct amdgpu_device *adev, enum aca_smu_type type) +{ +	struct amdgpu_aca *aca = &adev->aca; +	bool ret = true; + +	/* +	 * Because the UE Valid MCA count will only be cleared after reset, +	 * in order to avoid repeated counting of the error count, +	 * the aca bank is only updated once during the gpu recovery stage. 
+	 */ +	if (type == ACA_SMU_TYPE_UE) { +		if (amdgpu_ras_intr_triggered()) +			ret = atomic_cmpxchg(&aca->ue_update_flag, 0, 1) == 0; +		else +			atomic_set(&aca->ue_update_flag, 0); +	} + +	return ret; +} + +static int aca_banks_update(struct amdgpu_device *adev, enum aca_smu_type type, +			    bank_handler_t handler, struct ras_query_context *qctx, void *data)  {  	struct amdgpu_aca *aca = &adev->aca;  	struct aca_banks banks; @@ -389,9 +402,8 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type  	if (list_empty(&aca->mgr.list))  		return 0; -	/* NOTE: pmfw is only support UE and CE */ -	if (type == ACA_ERROR_TYPE_DEFERRED) -		type = ACA_ERROR_TYPE_CE; +	if (!aca_bank_should_update(adev, type)) +		return 0;  	ret = aca_smu_get_valid_aca_count(adev, type, &count);  	if (ret) @@ -402,7 +414,7 @@ static int aca_banks_update(struct amdgpu_device *adev, enum aca_error_type type  	aca_banks_init(&banks); -	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks); +	ret = aca_smu_get_valid_aca_banks(adev, type, 0, count, &banks, qctx);  	if (ret)  		goto err_release_banks; @@ -431,7 +443,7 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er  	if (type >= ACA_ERROR_TYPE_COUNT)  		return -EINVAL; -	count = bank_error->count[type]; +	count = bank_error->count;  	if (!count)  		return 0; @@ -447,6 +459,8 @@ static int aca_log_aca_error_data(struct aca_bank_error *bank_error, enum aca_er  		amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, NULL, count);  		break;  	case ACA_ERROR_TYPE_DEFERRED: +		amdgpu_ras_error_statistic_de_count(err_data, &mcm_info, NULL, count); +		break;  	default:  		break;  	} @@ -477,12 +491,25 @@ out_unlock:  }  static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, enum aca_error_type type, -				struct ras_err_data *err_data) +				struct ras_err_data *err_data, struct ras_query_context *qctx)  { +	enum aca_smu_type smu_type;  	int ret; +	switch (type) { +	case ACA_ERROR_TYPE_UE: +		smu_type = ACA_SMU_TYPE_UE; +		break; +	case ACA_ERROR_TYPE_CE: +	case ACA_ERROR_TYPE_DEFERRED: +		smu_type = ACA_SMU_TYPE_CE; +		break; +	default: +		return -EINVAL; +	} +  	/* udpate aca bank to aca source error_cache first */ -	ret = aca_banks_update(adev, type, handler_aca_log_bank_error, NULL); +	ret = aca_banks_update(adev, smu_type, handler_aca_log_bank_error, qctx, NULL);  	if (ret)  		return ret; @@ -498,10 +525,9 @@ static bool aca_handle_is_valid(struct aca_handle *handle)  }  int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, -			      enum aca_error_type type, void *data) +			      enum aca_error_type type, struct ras_err_data *err_data, +			      struct ras_query_context *qctx)  { -	struct ras_err_data *err_data = (struct ras_err_data *)data; -  	if (!handle || !err_data)  		return -EINVAL; @@ -511,7 +537,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *han  	if (!(BIT(type) & handle->mask))  		return  0; -	return __aca_get_error_data(adev, handle, type, err_data); +	return __aca_get_error_data(adev, handle, type, err_data, qctx);  }  static void aca_error_init(struct aca_error *aerr, enum aca_error_type type) @@ -668,6 +694,8 @@ int amdgpu_aca_init(struct amdgpu_device *adev)  	struct amdgpu_aca *aca = &adev->aca;  	int ret; +	atomic_set(&aca->ue_update_flag, 0); +  	ret = aca_manager_init(&aca->mgr);  	if (ret)  		return ret; @@ -680,6 +708,8 @@ void amdgpu_aca_fini(struct amdgpu_device *adev)  	struct 
amdgpu_aca *aca = &adev->aca;  	aca_manager_fini(&aca->mgr); + +	atomic_set(&aca->ue_update_flag, 0);  }  int amdgpu_aca_reset(struct amdgpu_device *adev) @@ -723,23 +753,13 @@ int aca_bank_info_decode(struct aca_bank *bank, struct aca_bank_info *info)  static int aca_bank_get_error_code(struct amdgpu_device *adev, struct aca_bank *bank)  { -	int error_code; - -	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { -	case IP_VERSION(13, 0, 6): -		if (!(adev->flags & AMD_IS_APU) && adev->pm.fw_version >= 0x00555600) { -			error_code = ACA_REG__SYND__ERRORINFORMATION(bank->regs[ACA_REG_IDX_SYND]); -			return error_code & 0xff; -		} -		break; -	default: -		break; -	} +	struct amdgpu_aca *aca = &adev->aca; +	const struct aca_smu_funcs *smu_funcs = aca->smu_funcs; -	/* NOTE: the true error code is encoded in status.errorcode[0:7] */ -	error_code = ACA_REG__STATUS__ERRORCODE(bank->regs[ACA_REG_IDX_STATUS]); +	if (!smu_funcs || !smu_funcs->parse_error_code) +		return -EOPNOTSUPP; -	return error_code & 0xff; +	return smu_funcs->parse_error_code(adev, bank);  }  int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank, int *err_codes, int size) @@ -750,6 +770,9 @@ int aca_bank_check_error_codes(struct amdgpu_device *adev, struct aca_bank *bank  		return -EINVAL;  	error_code = aca_bank_get_error_code(adev, bank); +	if (error_code < 0) +		return error_code; +  	for (i = 0; i < size; i++) {  		if (err_codes[i] == error_code)  			return 0; @@ -784,7 +807,7 @@ static int amdgpu_aca_smu_debug_mode_set(void *data, u64 val)  	return 0;  } -static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_error_type type, int idx) +static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_smu_type type, int idx)  {  	struct aca_bank_info info;  	int i, ret; @@ -793,7 +816,7 @@ static void aca_dump_entry(struct seq_file *m, struct aca_bank *bank, enum aca_e  	if (ret)  		return; -	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_ERROR_TYPE_UE ? "UE" : "CE"); +	seq_printf(m, "aca entry[%d].type: %s\n", idx, type ==  ACA_SMU_TYPE_UE ? 
"UE" : "CE");  	seq_printf(m, "aca entry[%d].info: socketid:%d aid:%d hwid:0x%03x mcatype:0x%04x\n",  		   idx, info.socket_id, info.die_id, info.hwid, info.mcatype); @@ -807,7 +830,7 @@ struct aca_dump_context {  };  static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *bank, -				 enum aca_error_type type, void *data) +				 enum aca_smu_type type, void *data)  {  	struct aca_dump_context *ctx = (struct aca_dump_context *)data; @@ -816,7 +839,7 @@ static int handler_aca_bank_dump(struct aca_handle *handle, struct aca_bank *ban  	return handler_aca_log_bank_error(handle, bank, type, NULL);  } -static int aca_dump_show(struct seq_file *m, enum aca_error_type type) +static int aca_dump_show(struct seq_file *m, enum aca_smu_type type)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)m->private;  	struct aca_dump_context context = { @@ -824,12 +847,12 @@ static int aca_dump_show(struct seq_file *m, enum aca_error_type type)  		.idx = 0,  	}; -	return aca_banks_update(adev, type, handler_aca_bank_dump, (void *)&context); +	return aca_banks_update(adev, type, handler_aca_bank_dump, NULL, (void *)&context);  }  static int aca_dump_ce_show(struct seq_file *m, void *unused)  { -	return aca_dump_show(m, ACA_ERROR_TYPE_CE); +	return aca_dump_show(m, ACA_SMU_TYPE_CE);  }  static int aca_dump_ce_open(struct inode *inode, struct file *file) @@ -847,7 +870,7 @@ static const struct file_operations aca_ce_dump_debug_fops = {  static int aca_dump_ue_show(struct seq_file *m, void *unused)  { -	return aca_dump_show(m, ACA_ERROR_TYPE_UE); +	return aca_dump_show(m, ACA_SMU_TYPE_UE);  }  static int aca_dump_ue_open(struct inode *inode, struct file *file) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 2da50e095883..5ef6b745f222 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -26,6 +26,9 @@  #include <linux/list.h> +struct ras_err_data; +struct ras_query_context; +  #define ACA_MAX_REGS_COUNT	(16)  #define ACA_REG_FIELD(x, h, l)			(((x) & GENMASK_ULL(h, l)) >> l) @@ -99,7 +102,14 @@ enum aca_error_type {  	ACA_ERROR_TYPE_COUNT  }; +enum aca_smu_type { +	ACA_SMU_TYPE_UE = 0, +	ACA_SMU_TYPE_CE, +	ACA_SMU_TYPE_COUNT, +}; +  struct aca_bank { +	enum aca_smu_type type;  	u64 regs[ACA_MAX_REGS_COUNT];  }; @@ -115,15 +125,10 @@ struct aca_bank_info {  	int mcatype;  }; -struct aca_bank_report { -	struct aca_bank_info info; -	u64 count[ACA_ERROR_TYPE_COUNT]; -}; -  struct aca_bank_error {  	struct list_head node;  	struct aca_bank_info info; -	u64 count[ACA_ERROR_TYPE_COUNT]; +	u64 count;  };  struct aca_error { @@ -157,9 +162,8 @@ struct aca_handle {  };  struct aca_bank_ops { -	int (*aca_bank_generate_report)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, -					struct aca_bank_report *report, void *data); -	bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, +	int (*aca_bank_parser)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type, void *data); +	bool (*aca_bank_is_valid)(struct aca_handle *handle, struct aca_bank *bank, enum aca_smu_type type,  				  void *data);  }; @@ -167,13 +171,15 @@ struct aca_smu_funcs {  	int max_ue_bank_count;  	int max_ce_bank_count;  	int (*set_debug_mode)(struct amdgpu_device *adev, bool enable); -	int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_error_type type, u32 *count); -	int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum 
aca_error_type type, int idx, struct aca_bank *bank); +	int (*get_valid_aca_count)(struct amdgpu_device *adev, enum aca_smu_type type, u32 *count); +	int (*get_valid_aca_bank)(struct amdgpu_device *adev, enum aca_smu_type type, int idx, struct aca_bank *bank); +	int (*parse_error_code)(struct amdgpu_device *adev, struct aca_bank *bank);  };  struct amdgpu_aca {  	struct aca_handle_manager mgr;  	const struct aca_smu_funcs *smu_funcs; +	atomic_t ue_update_flag;  	bool is_enabled;  }; @@ -196,7 +202,10 @@ int amdgpu_aca_add_handle(struct amdgpu_device *adev, struct aca_handle *handle,  			  const char *name, const struct aca_info *aca_info, void *data);  void amdgpu_aca_remove_handle(struct aca_handle *handle);  int amdgpu_aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *handle, -				     enum aca_error_type type, void *data); +			      enum aca_error_type type, struct ras_err_data *err_data, +			      struct ras_query_context *qctx);  int amdgpu_aca_smu_set_debug_mode(struct amdgpu_device *adev, bool en);  void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root); +int aca_error_cache_log_bank_error(struct aca_handle *handle, struct aca_bank_info *info, +				   enum aca_error_type type, u64 count);  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c index 6d72355ac492..bf6c4a0d0525 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c @@ -637,6 +637,8 @@ static const struct amd_ip_funcs acp_ip_funcs = {  	.soft_reset = acp_soft_reset,  	.set_clockgating_state = acp_set_clockgating_state,  	.set_powergating_state = acp_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version acp_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 35dd6effa9a3..e3738d417245 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,  		else  			mem_info->local_mem_size_private =  					KFD_XCP_MEMORY_SIZE(adev, xcp->id); +	} else if (adev->flags & AMD_IS_APU) { +		mem_info->local_mem_size_public = (ttm_tt_pages_limit() << PAGE_SHIFT); +		mem_info->local_mem_size_private = 0;  	} else {  		mem_info->local_mem_size_public = adev->gmc.visible_vram_size;  		mem_info->local_mem_size_private = adev->gmc.real_vram_size - @@ -747,10 +750,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)  	return amdgpu_ras_get_fed_status(adev);  } +void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev, +				enum amdgpu_ras_block block, uint16_t pasid, +				pasid_notify pasid_fn, void *data, uint32_t reset) +{ +	amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn, data, reset); +} +  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, -	enum amdgpu_ras_block block, bool reset) +	enum amdgpu_ras_block block, uint32_t reset)  { -	amdgpu_umc_poison_handler(adev, block, reset); +	amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL, reset);  }  int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, @@ -769,12 +779,20 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,  	return 0;  } -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev) +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device 
*adev, +			int hub_inst, int hub_type)  { -	if (adev->gfx.ras && adev->gfx.ras->query_utcl2_poison_status) -		return adev->gfx.ras->query_utcl2_poison_status(adev); -	else -		return false; +	if (!hub_type) { +		if (adev->gfxhub.funcs->query_utcl2_poison_status) +			return adev->gfxhub.funcs->query_utcl2_poison_status(adev, hub_inst); +		else +			return false; +	} else { +		if (adev->mmhub.funcs->query_utcl2_poison_status) +			return adev->mmhub.funcs->query_utcl2_poison_status(adev, hub_inst); +		else +			return false; +	}  }  int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev) @@ -809,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)  		}  		do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);  		return ALIGN_DOWN(tmp, PAGE_SIZE); +	} else if (adev->flags & AMD_IS_APU) { +		return (ttm_tt_pages_limit() << PAGE_SHIFT);  	} else {  		return adev->gmc.real_vram_size;  	} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 0ef223c2affb..1de021ebdd46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -336,12 +336,18 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);  int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,  				struct tile_config *config);  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, -			enum amdgpu_ras_block block, bool reset); +			enum amdgpu_ras_block block, uint32_t reset); + +void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device *adev, +			enum amdgpu_ras_block block, uint16_t pasid, +			pasid_notify pasid_fn, void *data, uint32_t reset); +  bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p);  int amdgpu_amdkfd_criu_resume(void *p); -bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev); +bool amdgpu_amdkfd_ras_query_utcl2_poison_status(struct amdgpu_device *adev, +			int hub_inst, int hub_type);  int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,  		uint64_t size, u32 alloc_flag, int8_t xcp_id);  void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 69810b3f1c63..3ab6c3aa0ad1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -881,6 +881,7 @@ uint32_t kgd_gfx_v10_set_wave_launch_mode(struct amdgpu_device *adev,  }  #define TCP_WATCH_STRIDE (mmTCP_WATCH1_ADDR_H - mmTCP_WATCH0_ADDR_H) +#define SQ_WATCH_STRIDE (mmSQ_WATCH1_ADDR_H - mmSQ_WATCH0_ADDR_H)  uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,  					uint64_t watch_address,  					uint32_t watch_address_mask, @@ -889,55 +890,93 @@ uint32_t kgd_gfx_v10_set_address_watch(struct amdgpu_device *adev,  					uint32_t debug_vmid,  					uint32_t inst)  { +	/* SQ_WATCH?_ADDR_* and TCP_WATCH?_ADDR_* are programmed with the +	 * same values. 
+	 */  	uint32_t watch_address_high;  	uint32_t watch_address_low; -	uint32_t watch_address_cntl; - -	watch_address_cntl = 0; +	uint32_t tcp_watch_address_cntl; +	uint32_t sq_watch_address_cntl;  	watch_address_low = lower_32_bits(watch_address);  	watch_address_high = upper_32_bits(watch_address) & 0xffff; -	watch_address_cntl = REG_SET_FIELD(watch_address_cntl, +	tcp_watch_address_cntl = 0; +	tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,  			TCP_WATCH0_CNTL,  			VMID,  			debug_vmid); -	watch_address_cntl = REG_SET_FIELD(watch_address_cntl, +	tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,  			TCP_WATCH0_CNTL,  			MODE,  			watch_mode); -	watch_address_cntl = REG_SET_FIELD(watch_address_cntl, +	tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,  			TCP_WATCH0_CNTL,  			MASK,  			watch_address_mask >> 7); +	sq_watch_address_cntl = 0; +	sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, +			SQ_WATCH0_CNTL, +			VMID, +			debug_vmid); +	sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, +			SQ_WATCH0_CNTL, +			MODE, +			watch_mode); +	sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, +			SQ_WATCH0_CNTL, +			MASK, +			watch_address_mask >> 6); +  	/* Turning off this watch point until we set all the registers */ -	watch_address_cntl = REG_SET_FIELD(watch_address_cntl, +	tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,  			TCP_WATCH0_CNTL,  			VALID,  			0); -  	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +  			(watch_id * TCP_WATCH_STRIDE)), -			watch_address_cntl); +			tcp_watch_address_cntl); + +	sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, +			SQ_WATCH0_CNTL, +			VALID, +			0); +	WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) + +			(watch_id * SQ_WATCH_STRIDE)), +			sq_watch_address_cntl); +	/* Program {TCP,SQ}_WATCH?_ADDR* */  	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_H) +  			(watch_id * TCP_WATCH_STRIDE)),  			watch_address_high); -  	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_ADDR_L) +  			(watch_id * TCP_WATCH_STRIDE)),  			watch_address_low); +	WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_H) + +			(watch_id * SQ_WATCH_STRIDE)), +			watch_address_high); +	WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_ADDR_L) + +			(watch_id * SQ_WATCH_STRIDE)), +			watch_address_low); +  	/* Enable the watch point */ -	watch_address_cntl = REG_SET_FIELD(watch_address_cntl, +	tcp_watch_address_cntl = REG_SET_FIELD(tcp_watch_address_cntl,  			TCP_WATCH0_CNTL,  			VALID,  			1); -  	WREG32((SOC15_REG_OFFSET(GC, 0, mmTCP_WATCH0_CNTL) +  			(watch_id * TCP_WATCH_STRIDE)), -			watch_address_cntl); +			tcp_watch_address_cntl); + +	sq_watch_address_cntl = REG_SET_FIELD(sq_watch_address_cntl, +			SQ_WATCH0_CNTL, +			VALID, +			1); +	WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) + +			(watch_id * SQ_WATCH_STRIDE)), +			sq_watch_address_cntl);  	return 0;  } @@ -953,8 +992,14 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct amdgpu_device *adev,  			(watch_id * TCP_WATCH_STRIDE)),  			watch_address_cntl); +	WREG32((SOC15_REG_OFFSET(GC, 0, mmSQ_WATCH0_CNTL) + +			(watch_id * SQ_WATCH_STRIDE)), +			watch_address_cntl); +  	return 0;  } +#undef TCP_WATCH_STRIDE +#undef SQ_WATCH_STRIDE  /* kgd_gfx_v10_get_iq_wait_times: Returns the mmCP_IQ_WAIT_TIME1/2 values diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index df58a6a1a67e..8975cf41a91a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,  			return -EINVAL;  		vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id); -		if (adev->gmc.is_app_apu) { +		if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {  			system_mem_needed = size;  			ttm_mem_needed = size;  		} @@ -220,7 +220,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,  	    (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >  	     kfd_mem_limit.max_ttm_mem_limit) ||  	    (adev && xcp_id >= 0 && adev->kfd.vram_used[xcp_id] + vram_needed > -	     vram_size - reserved_for_pt)) { +	     vram_size - reserved_for_pt - atomic64_read(&adev->vram_pin_size))) {  		ret = -ENOMEM;  		goto release;  	} @@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,  		  "adev reference can't be null when vram is used");  	if (adev && xcp_id >= 0) {  		adev->kfd.vram_used[xcp_id] += vram_needed; -		adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ? +		adev->kfd.vram_used_aligned[xcp_id] += +				(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?  				vram_needed :  				ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);  	} @@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,  		if (adev) {  			adev->kfd.vram_used[xcp_id] -= size; -			if (adev->gmc.is_app_apu) { +			if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {  				adev->kfd.vram_used_aligned[xcp_id] -= size;  				kfd_mem_limit.system_mem_used -= size;  				kfd_mem_limit.ttm_mem_used -= size; @@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,  	 * if peer device has large BAR. In contrast, access over xGMI is  	 * allowed for both small and large BAR configurations of peer device  	 */ -	if ((adev != bo_adev && !adev->gmc.is_app_apu) && +	if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&  	    ((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||  	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||  	     (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { @@ -1188,7 +1189,8 @@ static int reserve_bo_and_cond_vms(struct kgd_mem *mem,  	int ret;  	ctx->sync = &mem->sync; -	drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT, 0); +	drm_exec_init(&ctx->exec, DRM_EXEC_INTERRUPTIBLE_WAIT | +		      DRM_EXEC_IGNORE_DUPLICATES, 0);  	drm_exec_until_all_locked(&ctx->exec) {  		ctx->n_vms = 0;  		list_for_each_entry(entry, &mem->attachments, list) { @@ -1656,7 +1658,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,  		- atomic64_read(&adev->vram_pin_size)  		- reserved_for_pt; -	if (adev->gmc.is_app_apu) { +	if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {  		system_mem_available = no_system_mem_limit ?  					
kfd_mem_limit.max_system_mem_limit :  					kfd_mem_limit.max_system_mem_limit - @@ -1704,7 +1706,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(  	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {  		domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM; -		if (adev->gmc.is_app_apu) { +		if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {  			domain = AMDGPU_GEM_DOMAIN_GTT;  			alloc_domain = AMDGPU_GEM_DOMAIN_GTT;  			alloc_flags = 0; @@ -1854,6 +1856,7 @@ err_node_allow:  err_bo_create:  	amdgpu_amdkfd_unreserve_mem_limit(adev, aligned_size, flags, xcp_id);  err_reserve_limit: +	amdgpu_sync_free(&(*mem)->sync);  	mutex_destroy(&(*mem)->lock);  	if (gobj)  		drm_gem_object_put(gobj); @@ -1950,7 +1953,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(  	if (size) {  		if (!is_imported &&  		   (mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM || -		   (adev->gmc.is_app_apu && +		   ((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&  		    mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))  			*size = bo_size;  		else @@ -2372,8 +2375,9 @@ static int import_obj_create(struct amdgpu_device *adev,  	(*mem)->dmabuf = dma_buf;  	(*mem)->bo = bo;  	(*mem)->va = va; -	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ? -		AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT; +	(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && +			 !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ? +			 AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;  	(*mem)->mapped_to_gpu_memory = 0;  	(*mem)->process_info = avm->process_info; @@ -2900,13 +2904,12 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *  	amdgpu_sync_create(&sync_obj); -	/* Validate BOs and map them to GPUVM (update VM page tables). */ +	/* Validate BOs managed by KFD */  	list_for_each_entry(mem, &process_info->kfd_bo_list,  			    validate_list) {  		struct amdgpu_bo *bo = mem->bo;  		uint32_t domain = mem->domain; -		struct kfd_mem_attachment *attachment;  		struct dma_resv_iter cursor;  		struct dma_fence *fence; @@ -2931,6 +2934,25 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *  				goto validate_map_fail;  			}  		} +	} + +	if (failed_size) +		pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); + +	/* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO +	 * validations above would invalidate DMABuf imports again. +	 */ +	ret = process_validate_vms(process_info, &exec.ticket); +	if (ret) { +		pr_debug("Validating VMs failed, ret: %d\n", ret); +		goto validate_map_fail; +	} + +	/* Update mappings managed by KFD. */ +	list_for_each_entry(mem, &process_info->kfd_bo_list, +			    validate_list) { +		struct kfd_mem_attachment *attachment; +  		list_for_each_entry(attachment, &mem->attachments, list) {  			if (!attachment->is_mapped)  				continue; @@ -2947,18 +2969,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu *  		}  	} -	if (failed_size) -		pr_debug("0x%lx/0x%lx in system\n", failed_size, total_size); - -	/* Validate PDs, PTs and evicted DMABuf imports last. Otherwise BO -	 * validations above would invalidate DMABuf imports again. 
-	 */ -	ret = process_validate_vms(process_info, &exec.ticket); -	if (ret) { -		pr_debug("Validating VMs failed, ret: %d\n", ret); -		goto validate_map_fail; -	} -  	/* Update mappings not managed by KFD */  	list_for_each_entry(peer_vm, &process_info->vm_list_head,  			vm_list_node) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index 6857c586ded7..108003bdf1e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -34,6 +34,7 @@ union firmware_info {  	struct atom_firmware_info_v3_2 v32;  	struct atom_firmware_info_v3_3 v33;  	struct atom_firmware_info_v3_4 v34; +	struct atom_firmware_info_v3_5 v35;  };  /* @@ -211,6 +212,7 @@ union igp_info {  	struct atom_integrated_system_info_v1_11 v11;  	struct atom_integrated_system_info_v1_12 v12;  	struct atom_integrated_system_info_v2_1 v21; +	struct atom_integrated_system_info_v2_3 v23;  };  union umc_info { @@ -359,6 +361,20 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev,  					if (vram_type)  						*vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type);  					break; +				case 3: +					mem_channel_number = igp_info->v23.umachannelnumber; +					if (!mem_channel_number) +						mem_channel_number = 1; +					mem_type = igp_info->v23.memorytype; +					if (mem_type == LpDdr5MemType) +						mem_channel_width = 32; +					else +						mem_channel_width = 64; +					if (vram_width) +						*vram_width = mem_channel_number * mem_channel_width; +					if (vram_type) +						*vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type); +					break;  				default:  					return -EINVAL;  				} @@ -872,6 +888,10 @@ int amdgpu_atomfirmware_get_fw_reserved_fb_size(struct amdgpu_device *adev)  		fw_reserved_fb_size =  			(firmware_info->v34.fw_reserved_size_in_kb << 10);  		break; +	case 5: +		fw_reserved_fb_size = +			(firmware_info->v35.fw_reserved_size_in_kb << 10); +		break;  	default:  		fw_reserved_fb_size = 0;  		break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c index edc6377ec5ff..199693369c7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_benchmark.c @@ -39,7 +39,7 @@ static int amdgpu_benchmark_do_move(struct amdgpu_device *adev, unsigned size,  	for (i = 0; i < n; i++) {  		struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;  		r = amdgpu_copy_buffer(ring, saddr, daddr, size, NULL, &fence, -				       false, false, false); +				       false, false, 0);  		if (r)  			goto exit_do_move;  		r = dma_fence_wait(fence, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 0a4b09709cfb..ec888fc6ead8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -819,7 +819,7 @@ retry:  	p->bytes_moved += ctx.bytes_moved;  	if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && -	    amdgpu_bo_in_cpu_visible_vram(bo)) +	    amdgpu_res_cpu_visible(adev, bo->tbo.resource))  		p->bytes_moved_vis += ctx.bytes_moved;  	if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index f5d0fa207a88..b62ae3c91a9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -2065,12 +2065,13 @@ static ssize_t amdgpu_reset_dump_register_list_write(struct file *f, 
 	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;  	char reg_offset[11];  	uint32_t *new = NULL, *tmp = NULL; -	int ret, i = 0, len = 0; +	unsigned int len = 0; +	int ret, i = 0;  	do {  		memset(reg_offset, 0, 11);  		if (copy_from_user(reg_offset, buf + len, -					min(10, ((int)size-len)))) { +					min(10, (size-len)))) {  			ret = -EFAULT;  			goto error_free;  		} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c new file mode 100644 index 000000000000..c1cb62683695 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + */ + +#include <generated/utsrelease.h> +#include <linux/devcoredump.h> +#include "amdgpu_dev_coredump.h" +#include "atom.h" + +#ifndef CONFIG_DEV_COREDUMP +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, +		     struct amdgpu_reset_context *reset_context) +{ +} +#else + +const char *hw_ip_names[MAX_HWIP] = { +	[GC_HWIP]		= "GC", +	[HDP_HWIP]		= "HDP", +	[SDMA0_HWIP]		= "SDMA0", +	[SDMA1_HWIP]		= "SDMA1", +	[SDMA2_HWIP]		= "SDMA2", +	[SDMA3_HWIP]		= "SDMA3", +	[SDMA4_HWIP]		= "SDMA4", +	[SDMA5_HWIP]		= "SDMA5", +	[SDMA6_HWIP]		= "SDMA6", +	[SDMA7_HWIP]		= "SDMA7", +	[LSDMA_HWIP]		= "LSDMA", +	[MMHUB_HWIP]		= "MMHUB", +	[ATHUB_HWIP]		= "ATHUB", +	[NBIO_HWIP]		= "NBIO", +	[MP0_HWIP]		= "MP0", +	[MP1_HWIP]		= "MP1", +	[UVD_HWIP]		= "UVD/JPEG/VCN", +	[VCN1_HWIP]		= "VCN1", +	[VCE_HWIP]		= "VCE", +	[VPE_HWIP]		= "VPE", +	[DF_HWIP]		= "DF", +	[DCE_HWIP]		= "DCE", +	[OSSSYS_HWIP]		= "OSSSYS", +	[SMUIO_HWIP]		= "SMUIO", +	[PWR_HWIP]		= "PWR", +	[NBIF_HWIP]		= "NBIF", +	[THM_HWIP]		= "THM", +	[CLK_HWIP]		= "CLK", +	[UMC_HWIP]		= "UMC", +	[RSMU_HWIP]		= "RSMU", +	[XGMI_HWIP]		= "XGMI", +	[DCI_HWIP]		= "DCI", +	[PCIE_HWIP]		= "PCIE", +}; + +static void amdgpu_devcoredump_fw_info(struct amdgpu_device *adev, +				       struct drm_printer *p) +{ +	uint32_t version; +	uint32_t feature; +	uint8_t smu_program, smu_major, smu_minor, smu_debug; +	struct atom_context *ctx = adev->mode_info.atom_context; + +	drm_printf(p, "VCE feature version: %u, fw version: 0x%08x\n", +		   adev->vce.fb_version, adev->vce.fw_version); +	drm_printf(p, "UVD feature version: %u, fw version: 0x%08x\n", 0, +		   adev->uvd.fw_version); +	drm_printf(p, "GMC feature version: %u, fw version: 0x%08x\n", 0, +		   adev->gmc.fw_version); +	drm_printf(p, "ME feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.me_feature_version, adev->gfx.me_fw_version); +	drm_printf(p, "PFP feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.pfp_feature_version, adev->gfx.pfp_fw_version); +	drm_printf(p, "CE feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.ce_feature_version, adev->gfx.ce_fw_version); +	drm_printf(p, "RLC feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlc_feature_version, adev->gfx.rlc_fw_version); + +	drm_printf(p, "RLC SRLC feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlc_srlc_feature_version, +		   adev->gfx.rlc_srlc_fw_version); +	drm_printf(p, "RLC SRLG feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlc_srlg_feature_version, +		   adev->gfx.rlc_srlg_fw_version); +	drm_printf(p, "RLC SRLS feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlc_srls_feature_version, +		   adev->gfx.rlc_srls_fw_version); +	drm_printf(p, "RLCP feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlcp_ucode_feature_version, +		   adev->gfx.rlcp_ucode_version); +	drm_printf(p, "RLCV feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.rlcv_ucode_feature_version, +		   adev->gfx.rlcv_ucode_version); +	drm_printf(p, "MEC feature version: %u, fw version: 0x%08x\n", +		   adev->gfx.mec_feature_version, adev->gfx.mec_fw_version); + +	if (adev->gfx.mec2_fw) +		drm_printf(p, "MEC2 feature version: %u, fw version: 0x%08x\n", +			   adev->gfx.mec2_feature_version, +			   adev->gfx.mec2_fw_version); + +	drm_printf(p, "IMU feature version: %u, fw version: 0x%08x\n", 0, +		   adev->gfx.imu_fw_version); +	drm_printf(p, "PSP SOS feature version: %u, fw version: 0x%08x\n", +		   adev->psp.sos.feature_version, adev->psp.sos.fw_version); +	drm_printf(p, "PSP 
ASD feature version: %u, fw version: 0x%08x\n", +		   adev->psp.asd_context.bin_desc.feature_version, +		   adev->psp.asd_context.bin_desc.fw_version); + +	drm_printf(p, "TA XGMI feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.xgmi_context.context.bin_desc.feature_version, +		   adev->psp.xgmi_context.context.bin_desc.fw_version); +	drm_printf(p, "TA RAS feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.ras_context.context.bin_desc.feature_version, +		   adev->psp.ras_context.context.bin_desc.fw_version); +	drm_printf(p, "TA HDCP feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.hdcp_context.context.bin_desc.feature_version, +		   adev->psp.hdcp_context.context.bin_desc.fw_version); +	drm_printf(p, "TA DTM feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.dtm_context.context.bin_desc.feature_version, +		   adev->psp.dtm_context.context.bin_desc.fw_version); +	drm_printf(p, "TA RAP feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.rap_context.context.bin_desc.feature_version, +		   adev->psp.rap_context.context.bin_desc.fw_version); +	drm_printf(p, +		   "TA SECURE DISPLAY feature version: 0x%08x, fw version: 0x%08x\n", +		   adev->psp.securedisplay_context.context.bin_desc.feature_version, +		   adev->psp.securedisplay_context.context.bin_desc.fw_version); + +	/* SMC firmware */ +	version = adev->pm.fw_version; + +	smu_program = (version >> 24) & 0xff; +	smu_major = (version >> 16) & 0xff; +	smu_minor = (version >> 8) & 0xff; +	smu_debug = (version >> 0) & 0xff; +	drm_printf(p, +		   "SMC feature version: %u, program: %d, fw version: 0x%08x (%d.%d.%d)\n", +		   0, smu_program, version, smu_major, smu_minor, smu_debug); + +	/* SDMA firmware */ +	for (int i = 0; i < adev->sdma.num_instances; i++) { +		drm_printf(p, +			   "SDMA%d feature version: %u, firmware version: 0x%08x\n", +			   i, adev->sdma.instance[i].feature_version, +			   adev->sdma.instance[i].fw_version); +	} + +	drm_printf(p, "VCN feature version: %u, fw version: 0x%08x\n", 0, +		   adev->vcn.fw_version); +	drm_printf(p, "DMCU feature version: %u, fw version: 0x%08x\n", 0, +		   adev->dm.dmcu_fw_version); +	drm_printf(p, "DMCUB feature version: %u, fw version: 0x%08x\n", 0, +		   adev->dm.dmcub_fw_version); +	drm_printf(p, "PSP TOC feature version: %u, fw version: 0x%08x\n", +		   adev->psp.toc.feature_version, adev->psp.toc.fw_version); + +	version = adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK; +	feature = (adev->mes.kiq_version & AMDGPU_MES_FEAT_VERSION_MASK) >> +		  AMDGPU_MES_FEAT_VERSION_SHIFT; +	drm_printf(p, "MES_KIQ feature version: %u, fw version: 0x%08x\n", +		   feature, version); + +	version = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK; +	feature = (adev->mes.sched_version & AMDGPU_MES_FEAT_VERSION_MASK) >> +		  AMDGPU_MES_FEAT_VERSION_SHIFT; +	drm_printf(p, "MES feature version: %u, fw version: 0x%08x\n", feature, +		   version); + +	drm_printf(p, "VPE feature version: %u, fw version: 0x%08x\n", +		   adev->vpe.feature_version, adev->vpe.fw_version); + +	drm_printf(p, "\nVBIOS Information\n"); +	drm_printf(p, "vbios name       : %s\n", ctx->name); +	drm_printf(p, "vbios pn         : %s\n", ctx->vbios_pn); +	drm_printf(p, "vbios version    : %d\n", ctx->version); +	drm_printf(p, "vbios ver_str    : %s\n", ctx->vbios_ver_str); +	drm_printf(p, "vbios date       : %s\n", ctx->date); +} + +static ssize_t +amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, +			void *data, size_t datalen) +{ +	struct drm_printer p; +	
struct amdgpu_coredump_info *coredump = data; +	struct drm_print_iterator iter; +	struct amdgpu_vm_fault_info *fault_info; +	int i, ver; + +	iter.data = buffer; +	iter.offset = 0; +	iter.start = offset; +	iter.remain = count; + +	p = drm_coredump_printer(&iter); + +	drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); +	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); +	drm_printf(&p, "kernel: " UTS_RELEASE "\n"); +	drm_printf(&p, "module: " KBUILD_MODNAME "\n"); +	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, +		   coredump->reset_time.tv_nsec); + +	if (coredump->reset_task_info.pid) +		drm_printf(&p, "process_name: %s PID: %d\n", +			   coredump->reset_task_info.process_name, +			   coredump->reset_task_info.pid); + +	/* GPU IP's information of the SOC */ +	drm_printf(&p, "\nIP Information\n"); +	drm_printf(&p, "SOC Family: %d\n", coredump->adev->family); +	drm_printf(&p, "SOC Revision id: %d\n", coredump->adev->rev_id); +	drm_printf(&p, "SOC External Revision id: %d\n", coredump->adev->external_rev_id); + +	for (int i = 1; i < MAX_HWIP; i++) { +		for (int j = 0; j < HWIP_MAX_INSTANCE; j++) { +			ver = coredump->adev->ip_versions[i][j]; +			if (ver) +				drm_printf(&p, "HWIP: %s[%d][%d]: v%d.%d.%d.%d.%d\n", +					   hw_ip_names[i], i, j, +					   IP_VERSION_MAJ(ver), +					   IP_VERSION_MIN(ver), +					   IP_VERSION_REV(ver), +					   IP_VERSION_VARIANT(ver), +					   IP_VERSION_SUBREV(ver)); +		} +	} + +	/* IP firmware information */ +	drm_printf(&p, "\nIP Firmwares\n"); +	amdgpu_devcoredump_fw_info(coredump->adev, &p); + +	if (coredump->ring) { +		drm_printf(&p, "\nRing timed out details\n"); +		drm_printf(&p, "IP Type: %d Ring Name: %s\n", +			   coredump->ring->funcs->type, +			   coredump->ring->name); +	} + +	/* Add page fault information */ +	fault_info = &coredump->adev->vm_manager.fault_info; +	drm_printf(&p, "\n[%s] Page fault observed\n", +		   fault_info->vmhub ? 
"mmhub" : "gfxhub"); +	drm_printf(&p, "Faulty page starting at address: 0x%016llx\n", fault_info->addr); +	drm_printf(&p, "Protection fault status register: 0x%x\n\n", fault_info->status); + +	/* dump the ip state for each ip */ +	drm_printf(&p, "IP Dump\n"); +	for (int i = 0; i < coredump->adev->num_ip_blocks; i++) { +		if (coredump->adev->ip_blocks[i].version->funcs->print_ip_state) { +			drm_printf(&p, "IP: %s\n", +				   coredump->adev->ip_blocks[i] +					   .version->funcs->name); +			coredump->adev->ip_blocks[i] +				.version->funcs->print_ip_state( +					(void *)coredump->adev, &p); +			drm_printf(&p, "\n"); +		} +	} + +	/* Add ring buffer information */ +	drm_printf(&p, "Ring buffer information\n"); +	for (int i = 0; i < coredump->adev->num_rings; i++) { +		int j = 0; +		struct amdgpu_ring *ring = coredump->adev->rings[i]; + +		drm_printf(&p, "ring name: %s\n", ring->name); +		drm_printf(&p, "Rptr: 0x%llx Wptr: 0x%llx RB mask: %x\n", +			   amdgpu_ring_get_rptr(ring), +			   amdgpu_ring_get_wptr(ring), +			   ring->buf_mask); +		drm_printf(&p, "Ring size in dwords: %d\n", +			   ring->ring_size / 4); +		drm_printf(&p, "Ring contents\n"); +		drm_printf(&p, "Offset \t Value\n"); + +		while (j < ring->ring_size) { +			drm_printf(&p, "0x%x \t 0x%x\n", j, ring->ring[j / 4]); +			j += 4; +		} +	} + +	if (coredump->reset_vram_lost) +		drm_printf(&p, "VRAM is lost due to GPU reset!\n"); +	if (coredump->adev->reset_info.num_regs) { +		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n"); + +		for (i = 0; i < coredump->adev->reset_info.num_regs; i++) +			drm_printf(&p, "0x%08x: 0x%08x\n", +				   coredump->adev->reset_info.reset_dump_reg_list[i], +				   coredump->adev->reset_info.reset_dump_reg_value[i]); +	} + +	return count - iter.remain; +} + +static void amdgpu_devcoredump_free(void *data) +{ +	kfree(data); +} + +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, +		     struct amdgpu_reset_context *reset_context) +{ +	struct amdgpu_coredump_info *coredump; +	struct drm_device *dev = adev_to_drm(adev); +	struct amdgpu_job *job = reset_context->job; +	struct drm_sched_job *s_job; + +	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); + +	if (!coredump) { +		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); +		return; +	} + +	coredump->reset_vram_lost = vram_lost; + +	if (reset_context->job && reset_context->job->vm) { +		struct amdgpu_task_info *ti; +		struct amdgpu_vm *vm = reset_context->job->vm; + +		ti = amdgpu_vm_get_task_info_vm(vm); +		if (ti) { +			coredump->reset_task_info = *ti; +			amdgpu_vm_put_task_info(ti); +		} +	} + +	if (job) { +		s_job = &job->base; +		coredump->ring = to_amdgpu_ring(s_job->sched); +	} + +	coredump->adev = adev; + +	ktime_get_ts64(&coredump->reset_time); + +	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, +		      amdgpu_devcoredump_read, amdgpu_devcoredump_free); +} +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h new file mode 100644 index 000000000000..52459512cb2b --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __AMDGPU_DEV_COREDUMP_H__ +#define __AMDGPU_DEV_COREDUMP_H__ + +#include "amdgpu.h" +#include "amdgpu_reset.h" + +#ifdef CONFIG_DEV_COREDUMP + +#define AMDGPU_COREDUMP_VERSION "1" + +struct amdgpu_coredump_info { +	struct amdgpu_device            *adev; +	struct amdgpu_task_info         reset_task_info; +	struct timespec64               reset_time; +	bool                            reset_vram_lost; +	struct amdgpu_ring              *ring; +}; +#endif + +void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, +		     struct amdgpu_reset_context *reset_context); + +#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5dc24c971b41..861ccff78af9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -74,6 +74,7 @@  #include "amdgpu_fru_eeprom.h"  #include "amdgpu_reset.h"  #include "amdgpu_virt.h" +#include "amdgpu_dev_coredump.h"  #include <linux/suspend.h>  #include <drm/task_barrier.h> @@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {  	"LAST",  }; +static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); +  /**   * DOC: pcie_replay_count   * @@ -335,16 +338,93 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)   *   * @dev: drm_device pointer   * - * Returns true if the device supporte BACO, - * otherwise return false. + * Return: + * 1 if the device supporte BACO; + * 3 if the device support MACO (only works if BACO is supported) + * otherwise return 0.   
+void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
+{
+	struct drm_device *dev;
+	int bamaco_support;
+
+	dev = adev_to_drm(adev);
+
+	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
+	bamaco_support = amdgpu_device_supports_baco(dev);
+
+	switch (amdgpu_runtime_pm) {
+	case 2:
+		if (bamaco_support & MACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
+		} else if (bamaco_support == BACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to BACO\n");
+		}
+		break;
+	case 1:
+		if (bamaco_support & BACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
+		}
+		break;
+	case -1:
+	case -2:
+		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
+			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
+			dev_info(adev->dev, "Using ATPX for runtime pm\n");
+		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
+			dev_info(adev->dev, "Using BOCO for runtime pm\n");
+		} else {
+			if (!bamaco_support)
+				goto no_runtime_pm;
+
+			switch (adev->asic_type) {
+			case CHIP_VEGA20:
+			case CHIP_ARCTURUS:
+				/* BACO is not supported on vega20 and arcturus */
+				break;
+			case CHIP_VEGA10:
+				/* enable BACO as runpm mode if noretry=0 */
+				if (!adev->gmc.noretry)
+					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+				break;
+			default:
+				/* enable BACO as runpm mode on CI+ */
+				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+				break;
+			}
+
+			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
+				if (bamaco_support & MACO_SUPPORT) {
+					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
+				} else {
+					dev_info(adev->dev, "Using BACO for runtime pm\n");
+				}
+			}
+		}
+		break;
+	case 0:
+		dev_info(adev->dev, "runtime pm is manually disabled\n");
+		break;
+	default:
+		break;
+	}
+
+no_runtime_pm:
+	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
+		dev_info(adev->dev, "Runtime PM not available\n");
+}
 /**
  * amdgpu_device_supports_smart_shift - Is the device dGPU with
  * smart shift support
@@ -1402,13 +1482,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
  */
 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 {
-	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
+	unsigned long flags, offset;
 
+	spin_lock_irqsave(&adev->wb.lock, flags);
+	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 	if (offset < adev->wb.num_wb) {
 		__set_bit(offset, adev->wb.used);
+		spin_unlock_irqrestore(&adev->wb.lock, flags);
 		*wb = offset << 3; /* convert to dw offset */
 		return 0;
 	} else {
+		spin_unlock_irqrestore(&adev->wb.lock, flags);
 		return -EINVAL;
 	}
 }
@@ -1423,9 +1507,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
  */
 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 {
+	unsigned long flags;
+
 	wb >>= 3;
+	spin_lock_irqsave(&adev->wb.lock, flags);
 	if (wb < adev->wb.num_wb)
 		__clear_bit(wb, adev->wb.used);
+	spin_unlock_irqrestore(&adev->wb.lock, flags);
 }
 
 /**
@@ -1455,7 +1543,7 @@ int amdgpu_device_resize_fb_bar(struct
amdgpu_device *adev)  	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */  	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) -		DRM_WARN("System can't access extended configuration space,please check!!\n"); +		DRM_WARN("System can't access extended configuration space, please check!!\n");  	/* skip if the bios has already enabled large BAR */  	if (adev->gmc.real_vram_size && @@ -3981,6 +4069,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	spin_lock_init(&adev->se_cac_idx_lock);  	spin_lock_init(&adev->audio_endpt_idx_lock);  	spin_lock_init(&adev->mm_stats.lock); +	spin_lock_init(&adev->wb.lock);  	INIT_LIST_HEAD(&adev->shadow_list);  	mutex_init(&adev->shadow_list_lock); @@ -4069,6 +4158,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	/* Enable TMZ based on IP_VERSION */  	amdgpu_gmc_tmz_set(adev); +	if (amdgpu_sriov_vf(adev) && +	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0)) +		/* VF MMIO access (except mailbox range) from CPU +		 * will be blocked during sriov runtime +		 */ +		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; +  	amdgpu_gmc_noretry_set(adev);  	/* Need to get xgmi info early to decide the reset behavior*/  	if (adev->gmc.xgmi.supported) { @@ -4135,18 +4231,22 @@ int amdgpu_device_init(struct amdgpu_device *adev,  					adev->ip_blocks[i].status.hw = true;  				}  			} +		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && +				   !amdgpu_device_has_display_hardware(adev)) { +					r = psp_gpu_reset(adev);  		} else { -			tmp = amdgpu_reset_method; -			/* It should do a default reset when loading or reloading the driver, -			 * regardless of the module parameter reset_method. -			 */ -			amdgpu_reset_method = AMD_RESET_METHOD_NONE; -			r = amdgpu_asic_reset(adev); -			amdgpu_reset_method = tmp; -			if (r) { -				dev_err(adev->dev, "asic reset on init failed\n"); -				goto failed; -			} +				tmp = amdgpu_reset_method; +				/* It should do a default reset when loading or reloading the driver, +				 * regardless of the module parameter reset_method. 
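Aside on the new adev->wb.lock used in the amdgpu_device_wb_get()/amdgpu_device_wb_free() hunks above: the writeback bitmap is now read and claimed under an irq-safe spinlock, so two concurrent allocations can no longer both see the same free slot. The idiom is a plain bitmap allocator under a spinlock; a self-contained sketch with hypothetical names (my_pool and friends are illustrative, not driver code):

	#include <linux/bitmap.h>
	#include <linux/spinlock.h>

	struct my_pool {
		unsigned long	used[BITS_TO_LONGS(64)];
		unsigned int	num_slots;
		spinlock_t	lock;
	};

	static int my_pool_get(struct my_pool *p, unsigned int *slot)
	{
		unsigned long flags, off;

		spin_lock_irqsave(&p->lock, flags);
		off = find_first_zero_bit(p->used, p->num_slots);
		if (off < p->num_slots)
			__set_bit(off, p->used);	/* claim while holding the lock */
		spin_unlock_irqrestore(&p->lock, flags);

		if (off >= p->num_slots)
			return -EINVAL;
		*slot = off;
		return 0;
	}
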
+				 */ +				amdgpu_reset_method = AMD_RESET_METHOD_NONE; +				r = amdgpu_asic_reset(adev); +				amdgpu_reset_method = tmp; +		} + +		if (r) { +		  dev_err(adev->dev, "asic reset on init failed\n"); +		  goto failed;  		}  	} @@ -4539,6 +4639,8 @@ int amdgpu_device_prepare(struct drm_device *dev)  	if (r)  		goto unprepare; +	flush_delayed_work(&adev->gfx.gfx_off_delay_work); +  	for (i = 0; i < adev->num_ip_blocks; i++) {  		if (!adev->ip_blocks[i].status.valid)  			continue; @@ -4968,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,  retry:  	amdgpu_amdkfd_pre_reset(adev); +	amdgpu_device_stop_pending_resets(adev); +  	if (from_hypervisor)  		r = amdgpu_virt_request_full_gpu(adev, true);  	else  		r = amdgpu_virt_reset_gpu(adev);  	if (r)  		return r; +	amdgpu_ras_set_fed(adev, false);  	amdgpu_irq_gpu_reset_resume_helper(adev);  	/* some sw clean up VF needs to do before recover */ @@ -5257,11 +5362,21 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,  	struct amdgpu_device *tmp_adev = NULL;  	bool need_full_reset, skip_hw_reset, vram_lost = false;  	int r = 0; +	uint32_t i;  	/* Try reset handler method first */  	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,  				    reset_list); -	amdgpu_reset_reg_dumps(tmp_adev); + +	if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) { +		amdgpu_reset_reg_dumps(tmp_adev); + +		/* Trigger ip dump before we reset the asic */ +		for (i = 0; i < tmp_adev->num_ip_blocks; i++) +			if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state) +				tmp_adev->ip_blocks[i].version->funcs +				->dump_ip_state((void *)tmp_adev); +	}  	reset_context->reset_device_list = device_list_handle;  	r = amdgpu_reset_perform_reset(tmp_adev, reset_context); @@ -5334,7 +5449,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,  				vram_lost = amdgpu_device_check_vram_lost(tmp_adev); -				amdgpu_coredump(tmp_adev, vram_lost, reset_context); +				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) +					amdgpu_coredump(tmp_adev, vram_lost, reset_context);  				if (vram_lost) {  					DRM_INFO("VRAM is lost due to GPU reset!\n"); @@ -5532,6 +5648,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)  } +static int amdgpu_device_health_check(struct list_head *device_list_handle) +{ +	struct amdgpu_device *tmp_adev; +	int ret = 0; +	u32 status; + +	list_for_each_entry(tmp_adev, device_list_handle, reset_list) { +		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); +		if (PCI_POSSIBLE_ERROR(status)) { +			dev_err(tmp_adev->dev, "device lost from bus!"); +			ret = -ENODEV; +		} +	} + +	return ret; +} +  /**   * amdgpu_device_gpu_recover - reset the asic and recover scheduler   * @@ -5603,6 +5736,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,  		device_list_handle = &device_list;  	} +	if (!amdgpu_sriov_vf(adev)) { +		r = amdgpu_device_health_check(device_list_handle); +		if (r) +			goto end_reset; +	} +  	/* We need to lock reset domain only once both for XGMI and single device */  	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,  				    reset_list); @@ -5685,11 +5824,12 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */  			tmp_adev->asic_reset_res = r;  		} -		/* -		 * Drop all pending non scheduler resets. 
Scheduler resets -		 * were already dropped during drm_sched_stop -		 */ -		amdgpu_device_stop_pending_resets(tmp_adev); +		if (!amdgpu_sriov_vf(tmp_adev)) +			/* +			* Drop all pending non scheduler resets. Scheduler resets +			* were already dropped during drm_sched_stop +			*/ +			amdgpu_device_stop_pending_resets(tmp_adev);  	}  	/* Actual ASIC resets if needed.*/ @@ -5768,6 +5908,7 @@ skip_sched_resume:  					    reset_list);  	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +end_reset:  	if (hive) {  		mutex_unlock(&hive->hive_lock);  		amdgpu_put_xgmi_hive(hive); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index a07e4b87d4ca..0e31bdb4b7cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -97,6 +97,7 @@  #include "smuio_v13_0.h"  #include "smuio_v13_0_3.h"  #include "smuio_v13_0_6.h" +#include "smuio_v14_0_2.h"  #include "vcn_v5_0_0.h"  #include "jpeg_v5_0_0.h" @@ -245,6 +246,9 @@ static int amdgpu_discovery_read_binary_from_sysmem(struct amdgpu_device *adev,  	return -ENOENT;  } +#define IP_DISCOVERY_V2		2 +#define IP_DISCOVERY_V4		4 +  static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,  						 uint8_t *binary)  { @@ -259,14 +263,14 @@ static int amdgpu_discovery_read_binary_from_mem(struct amdgpu_device *adev,  	 * wait for this to complete.  Once the C2PMSG is updated, we can  	 * continue.  	 */ -	if (dev_is_removable(&adev->pdev->dev)) { -		for (i = 0; i < 1000; i++) { -			msg = RREG32(mmMP0_SMN_C2PMSG_33); -			if (msg & 0x80000000) -				break; -			msleep(1); -		} + +	for (i = 0; i < 1000; i++) { +		msg = RREG32(mmMP0_SMN_C2PMSG_33); +		if (msg & 0x80000000) +			break; +		usleep_range(1000, 1100);  	} +  	vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20;  	if (vram_size) { @@ -1896,6 +1900,9 @@ static int amdgpu_discovery_set_smu_ip_blocks(struct amdgpu_device *adev)  		amdgpu_device_ip_block_add(adev, &smu_v13_0_ip_block);  		break;  	case IP_VERSION(14, 0, 0): +	case IP_VERSION(14, 0, 1): +	case IP_VERSION(14, 0, 2): +	case IP_VERSION(14, 0, 3):  		amdgpu_device_ip_block_add(adev, &smu_v14_0_ip_block);  		break;  	default: @@ -2237,6 +2244,7 @@ static int amdgpu_discovery_set_umsch_mm_ip_blocks(struct amdgpu_device *adev)  {  	switch (amdgpu_ip_version(adev, VCN_HWIP, 0)) {  	case IP_VERSION(4, 0, 5): +	case IP_VERSION(4, 0, 6):  		if (amdgpu_umsch_mm & 0x1) {  			amdgpu_device_ip_block_add(adev, &umsch_mm_v4_0_ip_block);  			adev->enable_umsch_mm = true; @@ -2676,6 +2684,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)  	case IP_VERSION(14, 0, 1):  		adev->smuio.funcs = &smuio_v13_0_6_funcs;  		break; +	case IP_VERSION(14, 0, 2): +		adev->smuio.funcs = &smuio_v14_0_2_funcs; +		break;  	default:  		break;  	} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80b9642f2bc4..ea14f1c8f430 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -195,6 +195,7 @@ int amdgpu_async_gfx_ring = 1;  int amdgpu_mcbp = -1;  int amdgpu_discovery = -1;  int amdgpu_mes; +int amdgpu_mes_log_enable = 0;  int amdgpu_mes_kiq;  int amdgpu_noretry = -1;  int amdgpu_force_asic_type = -1; @@ -668,6 +669,15 @@ MODULE_PARM_DESC(mes,  module_param_named(mes, amdgpu_mes, int, 0444);  /** + * DOC: mes_log_enable (int) + * Enable Micro Engine Scheduler log. This is used to enable/disable MES internal log. 
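Once this parameter is set, the log is exposed through the amdgpu_mes_event_log debugfs file created later in this patch. A userspace sketch for reading it; the path assumes DRM minor 0 and a debugfs mount at /sys/kernel/debug, both of which may differ on a given system:

	#include <stdio.h>

	int main(void)
	{
		/* Path is an assumption: debugfs root + DRM minor 0 */
		FILE *f = fopen("/sys/kernel/debug/dri/0/amdgpu_mes_event_log", "r");
		char line[256];

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* hex dump of the MES log buffer */
		fclose(f);
		return 0;
	}
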
+ * (0 = disabled (default), 1 = enabled) + */ +MODULE_PARM_DESC(mes_log_enable, +	"Enable Micro Engine Scheduler log (0 = disabled (default), 1 = enabled)"); +module_param_named(mes_log_enable, amdgpu_mes_log_enable, int, 0444); + +/**   * DOC: mes_kiq (int)   * Enable Micro Engine Scheduler KIQ. This is a new engine pipe for kiq.   * (0 = disabled (default), 1 = enabled) @@ -915,7 +925,7 @@ module_param_named(freesync_video, amdgpu_freesync_vid_mode, uint, 0444);   * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)   */  MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco/bamaco)"); -module_param_named(reset_method, amdgpu_reset_method, int, 0444); +module_param_named(reset_method, amdgpu_reset_method, int, 0644);  /**   * DOC: bad_page_threshold (int) Bad page threshold is specifies the @@ -2471,6 +2481,7 @@ static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work)  	/* Use a common context, just need to make sure full reset is done */  	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); +	set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);  	r = amdgpu_do_asic_reset(&device_list, &reset_context);  	if (r) { @@ -2734,7 +2745,8 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)  		drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;  	} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {  		/* nothing to do */ -	} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { +	} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) || +			(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {  		amdgpu_device_baco_enter(drm_dev);  	} @@ -2774,7 +2786,8 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)  		 * PCI core handles it for _PR3.  		 
*/  		pci_set_master(pdev); -	} else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) { +	} else if ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) || +			(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)) {  		amdgpu_device_baco_exit(drm_dev);  	}  	ret = amdgpu_device_resume(drm_dev, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 55d5508987ff..1d955652f3ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1206,7 +1206,8 @@ void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev,  		fw_size = le32_to_cpu(cp_hdr_v2_0->data_size_bytes);  		break;  	default: -		break; +		dev_err(adev->dev, "Invalid ucode id %u\n", ucode_id); +		return;  	}  	if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 8fcf889ddce9..64f197bbc866 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -259,7 +259,6 @@ struct amdgpu_cu_info {  struct amdgpu_gfx_ras {  	struct amdgpu_ras_block_object  ras_block;  	void (*enable_watchdog_timer)(struct amdgpu_device *adev); -	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);  	int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,  				struct amdgpu_irq_src *source,  				struct amdgpu_iv_entry *entry); @@ -434,6 +433,10 @@ struct amdgpu_gfx {  	uint32_t			num_xcc_per_xcp;  	struct mutex			partition_mutex;  	bool				mcbp; /* mid command buffer preemption */ + +	/* IP reg dump */ +	uint32_t			*ip_dump; +	uint32_t			reg_count;  };  struct amdgpu_gfx_ras_reg_entry { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h index c7b44aeb671b..103a837ccc71 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h @@ -38,6 +38,8 @@ struct amdgpu_gfxhub_funcs {  	void (*mode2_save_regs)(struct amdgpu_device *adev);  	void (*mode2_restore_regs)(struct amdgpu_device *adev);  	void (*halt)(struct amdgpu_device *adev); +	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, +			int xcc_id);  };  struct amdgpu_gfxhub { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c index d79cb13e1aa8..00d6211e0fbf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_i2c.c @@ -279,7 +279,7 @@ amdgpu_i2c_lookup(struct amdgpu_device *adev,  	return NULL;  } -static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus, +static int amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,  				 u8 slave_addr,  				 u8 addr,  				 u8 *val) @@ -304,16 +304,18 @@ static void amdgpu_i2c_get_byte(struct amdgpu_i2c_chan *i2c_bus,  	out_buf[0] = addr;  	out_buf[1] = 0; -	if (i2c_transfer(&i2c_bus->adapter, msgs, 2) == 2) { -		*val = in_buf[0]; -		DRM_DEBUG("val = 0x%02x\n", *val); -	} else { -		DRM_DEBUG("i2c 0x%02x 0x%02x read failed\n", -			  addr, *val); +	if (i2c_transfer(&i2c_bus->adapter, msgs, 2) != 2) { +		DRM_DEBUG("i2c 0x%02x read failed\n", addr); +		return -EIO;  	} + +	*val = in_buf[0]; +	DRM_DEBUG("val = 0x%02x\n", *val); + +	return 0;  } -static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus, +static int amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,  				 u8 slave_addr,  				 u8 addr,  				 u8 val) @@ -329,9 +331,12 @@ static void amdgpu_i2c_put_byte(struct amdgpu_i2c_chan *i2c_bus,  	out_buf[0] = addr;  	out_buf[1] = val; -	if 
(i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1) -		DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n", -			  addr, val); +	if (i2c_transfer(&i2c_bus->adapter, &msg, 1) != 1) { +		DRM_DEBUG("i2c 0x%02x 0x%02x write failed\n", addr, val); +		return -EIO; +	} + +	return 0;  }  /* ddc router switching */ @@ -346,16 +351,18 @@ amdgpu_i2c_router_select_ddc_port(const struct amdgpu_connector *amdgpu_connecto  	if (!amdgpu_connector->router_bus)  		return; -	amdgpu_i2c_get_byte(amdgpu_connector->router_bus, +	if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr, -			    0x3, &val); +			    0x3, &val)) +		return;  	val &= ~amdgpu_connector->router.ddc_mux_control_pin;  	amdgpu_i2c_put_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr,  			    0x3, val); -	amdgpu_i2c_get_byte(amdgpu_connector->router_bus, +	if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr, -			    0x1, &val); +			    0x1, &val)) +		return;  	val &= ~amdgpu_connector->router.ddc_mux_control_pin;  	val |= amdgpu_connector->router.ddc_mux_state;  	amdgpu_i2c_put_byte(amdgpu_connector->router_bus, @@ -375,16 +382,18 @@ amdgpu_i2c_router_select_cd_port(const struct amdgpu_connector *amdgpu_connector  	if (!amdgpu_connector->router_bus)  		return; -	amdgpu_i2c_get_byte(amdgpu_connector->router_bus, +	if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr, -			    0x3, &val); +			    0x3, &val)) +		return;  	val &= ~amdgpu_connector->router.cd_mux_control_pin;  	amdgpu_i2c_put_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr,  			    0x3, val); -	amdgpu_i2c_get_byte(amdgpu_connector->router_bus, +	if (amdgpu_i2c_get_byte(amdgpu_connector->router_bus,  			    amdgpu_connector->router.i2c_addr, -			    0x1, &val); +			    0x1, &val)) +		return;  	val &= ~amdgpu_connector->router.cd_mux_control_pin;  	val |= amdgpu_connector->router.cd_mux_state;  	amdgpu_i2c_put_byte(amdgpu_connector->router_bus, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 7e6d09730e6d..665c63f55278 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -445,6 +445,14 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev,  	entry.ih = ih;  	entry.iv_entry = (const uint32_t *)&ih->ring[ring_index]; + +	/* +	 * timestamp is not supported on some legacy SOCs (cik, cz, iceland, +	 * si and tonga), so initialize timestamp and timestamp_src to 0 +	 */ +	entry.timestamp = 0; +	entry.timestamp_src = 0; +  	amdgpu_ih_decode_iv(adev, &entry);  	trace_amdgpu_iv(ih - &adev->irq.ih, &entry); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 4b3000c21ef2..e4742b65032d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -304,12 +304,15 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)  		dma_fence_set_error(finished, -ECANCELED);  	if (finished->error < 0) { -		DRM_INFO("Skip scheduling IBs!\n"); +		dev_dbg(adev->dev, "Skip scheduling IBs in ring(%s)", +			ring->name);  	} else {  		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,  				       &fence);  		if (r) -			DRM_ERROR("Error scheduling IBs (%d)\n", r); +			dev_err(adev->dev, +				"Error scheduling IBs (%d) in ring(%s)", r, +				ring->name);  	}  	job->job_run_counter++; diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index a2df3025a754..a0ea6fe8d060 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -149,38 +149,7 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags)  		goto out;  	} -	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE; -	if (amdgpu_device_supports_px(dev) && -	    (amdgpu_runtime_pm != 0)) { /* enable PX as runtime mode */ -		adev->pm.rpm_mode = AMDGPU_RUNPM_PX; -		dev_info(adev->dev, "Using ATPX for runtime pm\n"); -	} else if (amdgpu_device_supports_boco(dev) && -		   (amdgpu_runtime_pm != 0)) { /* enable boco as runtime mode */ -		adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO; -		dev_info(adev->dev, "Using BOCO for runtime pm\n"); -	} else if (amdgpu_device_supports_baco(dev) && -		   (amdgpu_runtime_pm != 0)) { -		switch (adev->asic_type) { -		case CHIP_VEGA20: -		case CHIP_ARCTURUS: -			/* enable BACO as runpm mode if runpm=1 */ -			if (amdgpu_runtime_pm > 0) -				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; -			break; -		case CHIP_VEGA10: -			/* enable BACO as runpm mode if noretry=0 */ -			if (!adev->gmc.noretry) -				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; -			break; -		default: -			/* enable BACO as runpm mode on CI+ */ -			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO; -			break; -		} - -		if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) -			dev_info(adev->dev, "Using BACO for runtime pm\n"); -	} +	amdgpu_device_detect_runtime_pm_mode(adev);  	/* Call ACPI methods: require modeset init  	 * but failure is not fatal diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 24ad4b97177b..0734490347db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -210,22 +210,26 @@ int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable)  	return -EOPNOTSUPP;  } -static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry) +static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, struct mca_bank_entry *entry, +					 struct ras_query_context *qctx)  { -	dev_info(adev->dev, HW_ERR "Accelerator Check Architecture events logged\n"); -	dev_info(adev->dev, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", -		 idx, entry->regs[MCA_REG_IDX_STATUS]); -	dev_info(adev->dev, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", -		 idx, entry->regs[MCA_REG_IDX_ADDR]); -	dev_info(adev->dev, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", -		 idx, entry->regs[MCA_REG_IDX_MISC0]); -	dev_info(adev->dev, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", -		 idx, entry->regs[MCA_REG_IDX_IPID]); -	dev_info(adev->dev, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", -		 idx, entry->regs[MCA_REG_IDX_SYND]); +	u64 event_id = qctx->event_id; + +	RAS_EVENT_LOG(adev, event_id, HW_ERR "Accelerator Check Architecture events logged\n"); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].STATUS=0x%016llx\n", +		      idx, entry->regs[MCA_REG_IDX_STATUS]); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].ADDR=0x%016llx\n", +		      idx, entry->regs[MCA_REG_IDX_ADDR]); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].MISC0=0x%016llx\n", +		      idx, entry->regs[MCA_REG_IDX_MISC0]); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].IPID=0x%016llx\n", +		      idx, entry->regs[MCA_REG_IDX_IPID]); +	RAS_EVENT_LOG(adev, event_id, HW_ERR "aca entry[%02d].SYND=0x%016llx\n", +		      idx, entry->regs[MCA_REG_IDX_SYND]);  } -int 
amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data) +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, +				 struct ras_err_data *err_data, struct ras_query_context *qctx)  {  	struct amdgpu_smuio_mcm_config_info mcm_info;  	struct ras_err_addr err_addr = {0}; @@ -244,7 +248,7 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo  	list_for_each_entry(node, &mca_set.list, node) {  		entry = &node->entry; -		amdgpu_mca_smu_mca_bank_dump(adev, i++, entry); +		amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx);  		count = 0;  		ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h index b964110ed1e0..e5bf07ce3451 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.h @@ -169,6 +169,7 @@ void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root  void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set);  int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry);  void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set); -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data); +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, +				 struct ras_err_data *err_data, struct ras_query_context *qctx);  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index a98e03e0a51f..5ca5c47ab54e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -32,6 +32,18 @@  #define AMDGPU_MES_MAX_NUM_OF_QUEUES_PER_PROCESS 1024  #define AMDGPU_ONE_DOORBELL_SIZE 8 +signed long amdgpu_mes_fence_wait_polling(u64 *fence, +					  u64 wait_seq, +					  signed long timeout) +{ + +	while ((s64)(wait_seq - *fence) > 0 && timeout > 0) { +		udelay(2); +		timeout -= 2; +	} +	return timeout > 0 ? 
timeout : 0; +} +  int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)  {  	return roundup(AMDGPU_ONE_DOORBELL_SIZE * @@ -40,7 +52,6 @@ int amdgpu_mes_doorbell_process_slice(struct amdgpu_device *adev)  }  static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev, -					 struct amdgpu_mes_process *process,  					 int ip_type, uint64_t *doorbell_index)  {  	unsigned int offset, found; @@ -65,7 +76,6 @@ static int amdgpu_mes_kernel_doorbell_get(struct amdgpu_device *adev,  }  static void amdgpu_mes_kernel_doorbell_free(struct amdgpu_device *adev, -					   struct amdgpu_mes_process *process,  					   uint32_t doorbell_index)  {  	unsigned int old, rel_index; @@ -102,7 +112,10 @@ static int amdgpu_mes_event_log_init(struct amdgpu_device *adev)  {  	int r; -	r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE, +	if (!amdgpu_mes_log_enable) +		return 0; + +	r = amdgpu_bo_create_kernel(adev, AMDGPU_MES_LOG_BUFFER_SIZE, PAGE_SIZE,  				    AMDGPU_GEM_DOMAIN_GTT,  				    &adev->mes.event_log_gpu_obj,  				    &adev->mes.event_log_gpu_addr, @@ -653,7 +666,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,  	*queue_id = queue->queue_id = r;  	/* allocate a doorbell index for the queue */ -	r = amdgpu_mes_kernel_doorbell_get(adev, gang->process, +	r = amdgpu_mes_kernel_doorbell_get(adev,  					  qprops->queue_type,  					  &qprops->doorbell_off);  	if (r) @@ -711,8 +724,7 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,  	return 0;  clean_up_doorbell: -	amdgpu_mes_kernel_doorbell_free(adev, gang->process, -				       qprops->doorbell_off); +	amdgpu_mes_kernel_doorbell_free(adev, qprops->doorbell_off);  clean_up_queue_id:  	spin_lock_irqsave(&adev->mes.queue_id_lock, flags);  	idr_remove(&adev->mes.queue_id_idr, queue->queue_id); @@ -766,8 +778,7 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)  			  queue_id);  	list_del(&queue->list); -	amdgpu_mes_kernel_doorbell_free(adev, gang->process, -				       queue->doorbell_off); +	amdgpu_mes_kernel_doorbell_free(adev, queue->doorbell_off);  	amdgpu_mes_unlock(&adev->mes);  	amdgpu_mes_queue_free_mqd(queue); @@ -775,6 +786,28 @@ int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id)  	return 0;  } +int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev, +				struct amdgpu_ring *ring) +{ +	struct mes_map_legacy_queue_input queue_input; +	int r; + +	memset(&queue_input, 0, sizeof(queue_input)); + +	queue_input.queue_type = ring->funcs->type; +	queue_input.doorbell_offset = ring->doorbell_index; +	queue_input.pipe_id = ring->pipe; +	queue_input.queue_id = ring->queue; +	queue_input.mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj); +	queue_input.wptr_addr = ring->wptr_gpu_addr; + +	r = adev->mes.funcs->map_legacy_queue(&adev->mes, &queue_input); +	if (r) +		DRM_ERROR("failed to map legacy queue\n"); + +	return r; +} +  int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,  				  struct amdgpu_ring *ring,  				  enum amdgpu_unmap_queues_action action, @@ -1129,6 +1162,7 @@ void amdgpu_mes_remove_ring(struct amdgpu_device *adev,  		return;  	amdgpu_mes_remove_hw_queue(adev, ring->hw_queue_id); +	del_timer_sync(&ring->fence_drv.fallback_timer);  	amdgpu_ring_fini(ring);  	kfree(ring);  } @@ -1471,7 +1505,7 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)  	const struct mes_firmware_header_v1_0 *mes_hdr;  	struct amdgpu_firmware_info *info;  	char ucode_prefix[30]; -	char fw_name[40]; +	char fw_name[50];  	bool 
need_retry = false;
 	int r;
 
@@ -1549,12 +1583,11 @@ static int amdgpu_debugfs_mes_event_log_show(struct seq_file *m, void *unused)
 	uint32_t *mem = (uint32_t *)(adev->mes.event_log_cpu_addr);
 
 	seq_hex_dump(m, "", DUMP_PREFIX_OFFSET, 32, 4,
-		     mem, PAGE_SIZE, false);
+		     mem, AMDGPU_MES_LOG_BUFFER_SIZE, false);
 
 	return 0;
 }
-
 
 DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_mes_event_log);
 
 #endif
@@ -1565,7 +1598,7 @@ void amdgpu_debugfs_mes_event_log_init(struct amdgpu_device *adev)
 
 #if defined(CONFIG_DEBUG_FS)
 	struct drm_minor *minor = adev_to_drm(adev)->primary;
 	struct dentry *root = minor->debugfs_root;
-	if (adev->enable_mes)
+	if (adev->enable_mes && amdgpu_mes_log_enable)
 		debugfs_create_file("amdgpu_mes_event_log", 0444, root,
 				    adev, &amdgpu_debugfs_mes_event_log_fops);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 7d4f93fea937..df9f0404d842 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -52,6 +52,7 @@ enum amdgpu_mes_priority_level {
 
 #define AMDGPU_MES_PROC_CTX_SIZE 0x1000 /* one page area */
 #define AMDGPU_MES_GANG_CTX_SIZE 0x1000 /* one page area */
+#define AMDGPU_MES_LOG_BUFFER_SIZE 0x4000 /* Maximum log buffer size for MES */
 
 struct amdgpu_mes_funcs;
 
@@ -140,6 +141,12 @@ struct amdgpu_mes {
 
 	/* ip specific functions */
 	const struct amdgpu_mes_funcs   *funcs;
+
+	/* mes resource_1 bo */
+	struct amdgpu_bo    *resource_1;
+	uint64_t            resource_1_gpu_addr;
+	void                *resource_1_addr;
+
 };
 
 struct amdgpu_mes_process {
@@ -241,6 +248,15 @@ struct mes_remove_queue_input {
 	uint64_t	gang_context_addr;
 };
 
+struct mes_map_legacy_queue_input {
+	uint32_t                           queue_type;
+	uint32_t                           doorbell_offset;
+	uint32_t                           pipe_id;
+	uint32_t                           queue_id;
+	uint64_t                           mqd_addr;
+	uint64_t                           wptr_addr;
+};
+
 struct mes_unmap_legacy_queue_input {
 	enum amdgpu_unmap_queues_action    action;
 	uint32_t                           queue_type;
@@ -317,6 +333,9 @@ struct amdgpu_mes_funcs {
 	int (*remove_hw_queue)(struct amdgpu_mes *mes,
 			       struct mes_remove_queue_input *input);
 
+	int (*map_legacy_queue)(struct amdgpu_mes *mes,
+				struct mes_map_legacy_queue_input *input);
+
 	int (*unmap_legacy_queue)(struct amdgpu_mes *mes,
 				  struct mes_unmap_legacy_queue_input *input);
 
@@ -333,6 +352,10 @@ struct amdgpu_mes_funcs {
 #define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
 #define amdgpu_mes_kiq_hw_fini(adev) (adev)->mes.kiq_hw_fini((adev))
 
+signed long amdgpu_mes_fence_wait_polling(u64 *fence,
+					  u64 wait_seq,
+					  signed long timeout);
+
 int amdgpu_mes_ctx_get_offs(struct amdgpu_ring *ring, unsigned int id_offs);
 
 int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe);
@@ -356,6 +379,8 @@ int amdgpu_mes_add_hw_queue(struct amdgpu_device *adev, int gang_id,
 			    int *queue_id);
 int amdgpu_mes_remove_hw_queue(struct amdgpu_device *adev, int queue_id);
 
+int amdgpu_mes_map_legacy_queue(struct amdgpu_device *adev,
+				struct amdgpu_ring *ring);
 int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
 				  struct amdgpu_ring *ring,
 				  enum amdgpu_unmap_queues_action action,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 1ca9d4ed8063..95d676ee207f 100644
---
a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -63,6 +63,8 @@ struct amdgpu_mmhub_funcs {  				uint64_t page_table_base);  	void (*update_power_gating)(struct amdgpu_device *adev,                                  bool enable); +	bool (*query_utcl2_poison_status)(struct amdgpu_device *adev, +				int hub_inst);  };  struct amdgpu_mmhub { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 010b0cb7693c..b2a83c802bbd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -39,6 +39,7 @@  #include "amdgpu.h"  #include "amdgpu_trace.h"  #include "amdgpu_amdkfd.h" +#include "amdgpu_vram_mgr.h"  /**   * DOC: amdgpu_object @@ -153,8 +154,10 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)  		else  			places[c].flags |= TTM_PL_FLAG_TOPDOWN; -		if (flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) +		if (abo->tbo.type == ttm_bo_type_kernel && +		    flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)  			places[c].flags |= TTM_PL_FLAG_CONTIGUOUS; +  		c++;  	} @@ -173,6 +176,12 @@ void amdgpu_bo_placement_from_domain(struct amdgpu_bo *abo, u32 domain)  			abo->flags & AMDGPU_GEM_CREATE_PREEMPTIBLE ?  			AMDGPU_PL_PREEMPT : TTM_PL_TT;  		places[c].flags = 0; +		/* +		 * When GTT is just an alternative to VRAM make sure that we +		 * only use it as fallback and still try to fill up VRAM first. +		 */ +		if (domain & abo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) +			places[c].flags |= TTM_PL_FLAG_FALLBACK;  		c++;  	} @@ -595,8 +604,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,  	if (!amdgpu_bo_support_uswc(bo->flags))  		bo->flags &= ~AMDGPU_GEM_CREATE_CPU_GTT_USWC; -	if (adev->ras_enabled) -		bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; +	bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE;  	bo->tbo.bdev = &adev->mman.bdev;  	if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | @@ -605,6 +613,8 @@ int amdgpu_bo_create(struct amdgpu_device *adev,  	else  		amdgpu_bo_placement_from_domain(bo, bp->domain);  	if (bp->type == ttm_bo_type_kernel) +		bo->tbo.priority = 2; +	else if (!(bp->flags & AMDGPU_GEM_CREATE_DISCARDABLE))  		bo->tbo.priority = 1;  	if (!bp->destroy) @@ -617,8 +627,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,  		return r;  	if (!amdgpu_gmc_vram_full_visible(&adev->gmc) && -	    bo->tbo.resource->mem_type == TTM_PL_VRAM && -	    amdgpu_bo_in_cpu_visible_vram(bo)) +	    amdgpu_res_cpu_visible(adev, bo->tbo.resource))  		amdgpu_cs_report_moved_bytes(adev, ctx.bytes_moved,  					     ctx.bytes_moved);  	else @@ -628,7 +637,7 @@ int amdgpu_bo_create(struct amdgpu_device *adev,  	    bo->tbo.resource->mem_type == TTM_PL_VRAM) {  		struct dma_fence *fence; -		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true); +		r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence);  		if (unlikely(r))  			goto fail_unreserve; @@ -758,7 +767,7 @@ int amdgpu_bo_restore_shadow(struct amdgpu_bo *shadow, struct dma_fence **fence)  	return amdgpu_copy_buffer(ring, shadow_addr, parent_addr,  				  amdgpu_bo_size(shadow), NULL, fence, -				  true, false, false); +				  true, false, 0);  }  /** @@ -960,6 +969,10 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,  		if (!bo->placements[i].lpfn ||  		    (lpfn && lpfn < bo->placements[i].lpfn))  			bo->placements[i].lpfn = lpfn; + +		if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS && +		    bo->placements[i].mem_type == TTM_PL_VRAM) 
+			bo->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;  	}  	r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); @@ -1272,23 +1285,25 @@ void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)  void amdgpu_bo_get_memory(struct amdgpu_bo *bo,  			  struct amdgpu_mem_stats *stats)  { +	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); +	struct ttm_resource *res = bo->tbo.resource;  	uint64_t size = amdgpu_bo_size(bo);  	struct drm_gem_object *obj;  	unsigned int domain;  	bool shared;  	/* Abort if the BO doesn't currently have a backing store */ -	if (!bo->tbo.resource) +	if (!res)  		return;  	obj = &bo->tbo.base;  	shared = drm_gem_object_is_shared_for_memory_stats(obj); -	domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type); +	domain = amdgpu_mem_type_to_domain(res->mem_type);  	switch (domain) {  	case AMDGPU_GEM_DOMAIN_VRAM:  		stats->vram += size; -		if (amdgpu_bo_in_cpu_visible_vram(bo)) +		if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))  			stats->visible_vram += size;  		if (shared)  			stats->vram_shared += size; @@ -1359,8 +1374,9 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)  	if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))  		return; -	r = amdgpu_fill_buffer(abo, AMDGPU_POISON, bo->base.resv, &fence, true); +	r = amdgpu_fill_buffer(abo, 0, bo->base.resv, &fence, true);  	if (!WARN_ON(r)) { +		amdgpu_vram_mgr_set_cleared(bo->resource);  		amdgpu_bo_fence(abo, fence, false);  		dma_fence_put(fence);  	} @@ -1389,10 +1405,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo)  	/* Remember that this BO was accessed by the CPU */  	abo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -	if (bo->resource->mem_type != TTM_PL_VRAM) -		return 0; - -	if (amdgpu_bo_in_cpu_visible_vram(abo)) +	if (amdgpu_res_cpu_visible(adev, bo->resource))  		return 0;  	/* Can't move a pinned BO to visible VRAM */ @@ -1415,7 +1428,7 @@ vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo)  	/* this should never happen */  	if (bo->resource->mem_type == TTM_PL_VRAM && -	    !amdgpu_bo_in_cpu_visible_vram(abo)) +	    !amdgpu_res_cpu_visible(adev, bo->resource))  		return VM_FAULT_SIGBUS;  	ttm_bo_move_to_lru_tail_unlocked(bo); @@ -1579,6 +1592,7 @@ uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev,   */  u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m)  { +	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);  	struct dma_buf_attachment *attachment;  	struct dma_buf *dma_buf;  	const char *placement; @@ -1587,10 +1601,11 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m)  	if (dma_resv_trylock(bo->tbo.base.resv)) {  		unsigned int domain; +  		domain = amdgpu_mem_type_to_domain(bo->tbo.resource->mem_type);  		switch (domain) {  		case AMDGPU_GEM_DOMAIN_VRAM: -			if (amdgpu_bo_in_cpu_visible_vram(bo)) +			if (amdgpu_res_cpu_visible(adev, bo->tbo.resource))  				placement = "VRAM VISIBLE";  			else  				placement = "VRAM"; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index be679c42b0b8..fa03d9e4874c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -251,28 +251,6 @@ static inline u64 amdgpu_bo_mmap_offset(struct amdgpu_bo *bo)  }  /** - * amdgpu_bo_in_cpu_visible_vram - check if BO is (partly) in visible VRAM - */ -static inline bool amdgpu_bo_in_cpu_visible_vram(struct amdgpu_bo *bo) -{ -	struct amdgpu_device *adev = 
amdgpu_ttm_adev(bo->tbo.bdev); -	struct amdgpu_res_cursor cursor; - -	if (!bo->tbo.resource || bo->tbo.resource->mem_type != TTM_PL_VRAM) -		return false; - -	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); -	while (cursor.remaining) { -		if (cursor.start < adev->gmc.visible_vram_size) -			return true; - -		amdgpu_res_next(&cursor, cursor.size); -	} - -	return false; -} - -/**   * amdgpu_bo_explicit_sync - return whether the bo is explicitly synced   */  static inline bool amdgpu_bo_explicit_sync(struct amdgpu_bo *bo) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 94b310fdb719..4bd4602d11b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1053,6 +1053,11 @@ static int psp_asd_initialize(struct psp_context *psp)  	if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)  		return 0; +	/* bypass asd if display hardware is not available */ +	if (!amdgpu_device_has_display_hardware(psp->adev) && +	    amdgpu_ip_version(psp->adev, MP0_HWIP, 0) >= IP_VERSION(13, 0, 10)) +		return 0; +  	psp->asd_context.mem_context.shared_mc_addr  = 0;  	psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;  	psp->asd_context.ta_load_type                = GFX_CMD_ID_LOAD_ASD; @@ -2260,6 +2265,15 @@ static int psp_hw_start(struct psp_context *psp)  			}  		} +		if ((is_psp_fw_valid(psp->ipkeymgr_drv)) && +		    (psp->funcs->bootloader_load_ipkeymgr_drv != NULL)) { +			ret = psp_bootloader_load_ipkeymgr_drv(psp); +			if (ret) { +				dev_err(adev->dev, "PSP load ipkeymgr_drv failed!\n"); +				return ret; +			} +		} +  		if ((is_psp_fw_valid(psp->sos)) &&  		    (psp->funcs->bootloader_load_sos != NULL)) {  			ret = psp_bootloader_load_sos(psp); @@ -2617,7 +2631,8 @@ static int psp_load_p2s_table(struct psp_context *psp)  	struct amdgpu_firmware_info *ucode =  		&adev->firmware.ucode[AMDGPU_UCODE_ID_P2S_TABLE]; -	if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)) +	if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) || +				(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))  		return 0;  	if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) { @@ -2647,7 +2662,8 @@ static int psp_load_smu_fw(struct psp_context *psp)  	 * Skip SMU FW reloading in case of using BACO for runpm only,  	 * as SMU is always alive.  	 
*/ -	if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)) +	if (adev->in_runpm && ((adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) || +				(adev->pm.rpm_mode == AMDGPU_RUNPM_BAMACO)))  		return 0;  	if (!ucode->fw || amdgpu_sriov_vf(psp->adev)) @@ -3273,6 +3289,12 @@ static int parse_sos_bin_descriptor(struct psp_context *psp,  		psp->ras_drv.size_bytes         = le32_to_cpu(desc->size_bytes);  		psp->ras_drv.start_addr         = ucode_start_addr;  		break; +	case PSP_FW_TYPE_PSP_IPKEYMGR_DRV: +		psp->ipkeymgr_drv.fw_version         = le32_to_cpu(desc->fw_version); +		psp->ipkeymgr_drv.feature_version    = le32_to_cpu(desc->fw_version); +		psp->ipkeymgr_drv.size_bytes         = le32_to_cpu(desc->size_bytes); +		psp->ipkeymgr_drv.start_addr         = ucode_start_addr; +		break;  	default:  		dev_warn(psp->adev->dev, "Unsupported PSP FW type: %d\n", desc->fw_type);  		break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h index ee16f134ae92..3635303e6548 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h @@ -73,8 +73,10 @@ enum psp_bootloader_cmd {  	PSP_BL__LOAD_KEY_DATABASE	= 0x80000,  	PSP_BL__LOAD_SOCDRV             = 0xB0000,  	PSP_BL__LOAD_DBGDRV             = 0xC0000, +	PSP_BL__LOAD_HADDRV		= PSP_BL__LOAD_DBGDRV,  	PSP_BL__LOAD_INTFDRV		= 0xD0000, -	PSP_BL__LOAD_RASDRV		    = 0xE0000, +	PSP_BL__LOAD_RASDRV		= 0xE0000, +	PSP_BL__LOAD_IPKEYMGRDRV	= 0xF0000,  	PSP_BL__DRAM_LONG_TRAIN		= 0x100000,  	PSP_BL__DRAM_SHORT_TRAIN	= 0x200000,  	PSP_BL__LOAD_TOS_SPL_TABLE	= 0x10000000, @@ -117,6 +119,7 @@ struct psp_funcs {  	int (*bootloader_load_intf_drv)(struct psp_context *psp);  	int (*bootloader_load_dbg_drv)(struct psp_context *psp);  	int (*bootloader_load_ras_drv)(struct psp_context *psp); +	int (*bootloader_load_ipkeymgr_drv)(struct psp_context *psp);  	int (*bootloader_load_sos)(struct psp_context *psp);  	int (*ring_create)(struct psp_context *psp,  			   enum psp_ring_type ring_type); @@ -336,6 +339,7 @@ struct psp_context {  	struct psp_bin_desc		intf_drv;  	struct psp_bin_desc		dbg_drv;  	struct psp_bin_desc		ras_drv; +	struct psp_bin_desc		ipkeymgr_drv;  	/* tmr buffer */  	struct amdgpu_bo		*tmr_bo; @@ -424,6 +428,9 @@ struct amdgpu_psp_funcs {  #define psp_bootloader_load_ras_drv(psp) \  		((psp)->funcs->bootloader_load_ras_drv ? \  		(psp)->funcs->bootloader_load_ras_drv((psp)) : 0) +#define psp_bootloader_load_ipkeymgr_drv(psp) \ +		((psp)->funcs->bootloader_load_ipkeymgr_drv ? \ +		 (psp)->funcs->bootloader_load_ipkeymgr_drv((psp)) : 0)  #define psp_bootloader_load_sos(psp) \  		((psp)->funcs->bootloader_load_sos ? 
(psp)->funcs->bootloader_load_sos((psp)) : 0)  #define psp_smu_reload_quirk(psp) \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8ebab6f22e5a..1adc81a55734 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)  #define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms +#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms +  enum amdgpu_ras_retire_page_reservation {  	AMDGPU_RAS_RETIRE_PAGE_RESERVED,  	AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1045,6 +1047,7 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d  static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,  					      struct ras_manager *ras_mgr,  					      struct ras_err_data *err_data, +					      struct ras_query_context *qctx,  					      const char *blk_name,  					      bool is_ue,  					      bool is_de) @@ -1052,27 +1055,28 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,  	struct amdgpu_smuio_mcm_config_info *mcm_info;  	struct ras_err_node *err_node;  	struct ras_err_info *err_info; +	u64 event_id = qctx->event_id;  	if (is_ue) {  		for_each_ras_error(err_node, err_data) {  			err_info = &err_node->err_info;  			mcm_info = &err_info->mcm_info;  			if (err_info->ue_count) { -				dev_info(adev->dev, "socket: %d, die: %d, " -					 "%lld new uncorrectable hardware errors detected in %s block\n", -					 mcm_info->socket_id, -					 mcm_info->die_id, -					 err_info->ue_count, -					 blk_name); +				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +					      "%lld new uncorrectable hardware errors detected in %s block\n", +					      mcm_info->socket_id, +					      mcm_info->die_id, +					      err_info->ue_count, +					      blk_name);  			}  		}  		for_each_ras_error(err_node, &ras_mgr->err_data) {  			err_info = &err_node->err_info;  			mcm_info = &err_info->mcm_info; -			dev_info(adev->dev, "socket: %d, die: %d, " -				 "%lld uncorrectable hardware errors detected in total in %s block\n", -				 mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name); +			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +				      "%lld uncorrectable hardware errors detected in total in %s block\n", +				      mcm_info->socket_id, mcm_info->die_id, err_info->ue_count, blk_name);  		}  	} else { @@ -1081,44 +1085,44 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,  				err_info = &err_node->err_info;  				mcm_info = &err_info->mcm_info;  				if (err_info->de_count) { -					dev_info(adev->dev, "socket: %d, die: %d, " -						"%lld new deferred hardware errors detected in %s block\n", -						mcm_info->socket_id, -						mcm_info->die_id, -						err_info->de_count, -						blk_name); +					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +						      "%lld new deferred hardware errors detected in %s block\n", +						      mcm_info->socket_id, +						      mcm_info->die_id, +						      err_info->de_count, +						      blk_name);  				}  			}  			for_each_ras_error(err_node, &ras_mgr->err_data) {  				err_info = &err_node->err_info;  				mcm_info = &err_info->mcm_info; -				dev_info(adev->dev, "socket: %d, die: %d, " -					"%lld deferred hardware errors detected in total in %s block\n", -					mcm_info->socket_id, mcm_info->die_id, -					err_info->de_count, blk_name); +				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +					      "%lld 
deferred hardware errors detected in total in %s block\n", +					      mcm_info->socket_id, mcm_info->die_id, +					      err_info->de_count, blk_name);  			}  		} else {  			for_each_ras_error(err_node, err_data) {  				err_info = &err_node->err_info;  				mcm_info = &err_info->mcm_info;  				if (err_info->ce_count) { -					dev_info(adev->dev, "socket: %d, die: %d, " -						"%lld new correctable hardware errors detected in %s block\n", -						mcm_info->socket_id, -						mcm_info->die_id, -						err_info->ce_count, -						blk_name); +					RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +						      "%lld new correctable hardware errors detected in %s block\n", +						      mcm_info->socket_id, +						      mcm_info->die_id, +						      err_info->ce_count, +						      blk_name);  				}  			}  			for_each_ras_error(err_node, &ras_mgr->err_data) {  				err_info = &err_node->err_info;  				mcm_info = &err_info->mcm_info; -				dev_info(adev->dev, "socket: %d, die: %d, " -					"%lld correctable hardware errors detected in total in %s block\n", -					mcm_info->socket_id, mcm_info->die_id, -					err_info->ce_count, blk_name); +				RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d, " +					      "%lld correctable hardware errors detected in total in %s block\n", +					      mcm_info->socket_id, mcm_info->die_id, +					      err_info->ce_count, blk_name);  			}  		}  	} @@ -1131,77 +1135,79 @@ static inline bool err_data_has_source_info(struct ras_err_data *data)  static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,  					     struct ras_query_if *query_if, -					     struct ras_err_data *err_data) +					     struct ras_err_data *err_data, +					     struct ras_query_context *qctx)  {  	struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);  	const char *blk_name = get_ras_block_str(&query_if->head); +	u64 event_id = qctx->event_id;  	if (err_data->ce_count) {  		if (err_data_has_source_info(err_data)) { -			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, +			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,  							  blk_name, false, false);  		} else if (!adev->aid_mask &&  			   adev->smuio.funcs &&  			   adev->smuio.funcs->get_socket_id &&  			   adev->smuio.funcs->get_die_id) { -			dev_info(adev->dev, "socket: %d, die: %d " -				 "%ld correctable hardware errors " -				 "detected in %s block\n", -				 adev->smuio.funcs->get_socket_id(adev), -				 adev->smuio.funcs->get_die_id(adev), -				 ras_mgr->err_data.ce_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " +				      "%ld correctable hardware errors " +				      "detected in %s block\n", +				      adev->smuio.funcs->get_socket_id(adev), +				      adev->smuio.funcs->get_die_id(adev), +				      ras_mgr->err_data.ce_count, +				      blk_name);  		} else { -			dev_info(adev->dev, "%ld correctable hardware errors " -				 "detected in %s block\n", -				 ras_mgr->err_data.ce_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "%ld correctable hardware errors " +				      "detected in %s block\n", +				      ras_mgr->err_data.ce_count, +				      blk_name);  		}  	}  	if (err_data->ue_count) {  		if (err_data_has_source_info(err_data)) { -			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, +			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,  							  blk_name, true, false);  		} else if (!adev->aid_mask &&  			   adev->smuio.funcs &&  			   adev->smuio.funcs->get_socket_id &&  			   
adev->smuio.funcs->get_die_id) { -			dev_info(adev->dev, "socket: %d, die: %d " -				 "%ld uncorrectable hardware errors " -				 "detected in %s block\n", -				 adev->smuio.funcs->get_socket_id(adev), -				 adev->smuio.funcs->get_die_id(adev), -				 ras_mgr->err_data.ue_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " +				      "%ld uncorrectable hardware errors " +				      "detected in %s block\n", +				      adev->smuio.funcs->get_socket_id(adev), +				      adev->smuio.funcs->get_die_id(adev), +				      ras_mgr->err_data.ue_count, +				      blk_name);  		} else { -			dev_info(adev->dev, "%ld uncorrectable hardware errors " -				 "detected in %s block\n", -				 ras_mgr->err_data.ue_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "%ld uncorrectable hardware errors " +				      "detected in %s block\n", +				      ras_mgr->err_data.ue_count, +				      blk_name);  		}  	}  	if (err_data->de_count) {  		if (err_data_has_source_info(err_data)) { -			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, +			amdgpu_ras_error_print_error_data(adev, ras_mgr, err_data, qctx,  							  blk_name, false, true);  		} else if (!adev->aid_mask &&  			   adev->smuio.funcs &&  			   adev->smuio.funcs->get_socket_id &&  			   adev->smuio.funcs->get_die_id) { -			dev_info(adev->dev, "socket: %d, die: %d " -				 "%ld deferred hardware errors " -				 "detected in %s block\n", -				 adev->smuio.funcs->get_socket_id(adev), -				 adev->smuio.funcs->get_die_id(adev), -				 ras_mgr->err_data.de_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "socket: %d, die: %d " +				      "%ld deferred hardware errors " +				      "detected in %s block\n", +				      adev->smuio.funcs->get_socket_id(adev), +				      adev->smuio.funcs->get_die_id(adev), +				      ras_mgr->err_data.de_count, +				      blk_name);  		} else { -			dev_info(adev->dev, "%ld deferred hardware errors " -				 "detected in %s block\n", -				 ras_mgr->err_data.de_count, -				 blk_name); +			RAS_EVENT_LOG(adev, event_id, "%ld deferred hardware errors " +				      "detected in %s block\n", +				      ras_mgr->err_data.de_count, +				      blk_name);  		}  	}  } @@ -1244,6 +1250,10 @@ int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk,  {  	struct ras_manager *obj; +	/* in resume phase, no need to create aca fs node */ +	if (adev->in_suspend || amdgpu_in_reset(adev)) +		return 0; +  	obj = get_ras_manager(adev, blk);  	if (!obj)  		return -EINVAL; @@ -1265,7 +1275,8 @@ int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk)  }  static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu_ras_block blk, -					 enum aca_error_type type, struct ras_err_data *err_data) +					 enum aca_error_type type, struct ras_err_data *err_data, +					 struct ras_query_context *qctx)  {  	struct ras_manager *obj; @@ -1273,7 +1284,7 @@ static int amdgpu_aca_log_ras_error_data(struct amdgpu_device *adev, enum amdgpu  	if (!obj)  		return -EINVAL; -	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data); +	return amdgpu_aca_get_error_data(adev, &obj->aca_handle, type, err_data, qctx);  }  ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, @@ -1287,13 +1298,14 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a  	if (amdgpu_ras_query_error_status(obj->adev, &info))  		return -EINVAL; -	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, -			  "ce", 
info.ce_count); +	return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count, +			  "ce", info.ce_count, "de", info.de_count);  }  static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,  						struct ras_query_if *info,  						struct ras_err_data *err_data, +						struct ras_query_context *qctx,  						unsigned int error_query_mode)  {  	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT; @@ -1329,17 +1341,21 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,  		}  	} else {  		if (amdgpu_aca_is_enabled(adev)) { -			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data); +			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_UE, err_data, qctx); +			if (ret) +				return ret; + +			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data, qctx);  			if (ret)  				return ret; -			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_CE, err_data); +			ret = amdgpu_aca_log_ras_error_data(adev, blk, ACA_ERROR_TYPE_DEFERRED, err_data, qctx);  			if (ret)  				return ret;  		} else {  			/* FIXME: add code to check return value later */ -			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data); -			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data); +			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data, qctx); +			amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data, qctx);  		}  	} @@ -1351,6 +1367,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i  {  	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);  	struct ras_err_data err_data; +	struct ras_query_context qctx;  	unsigned int error_query_mode;  	int ret; @@ -1364,8 +1381,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i  	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))  		return -EINVAL; +	memset(&qctx, 0, sizeof(qctx)); +	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? 
+						   RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);  	ret = amdgpu_ras_query_error_status_helper(adev, info,  						   &err_data, +						   &qctx,  						   error_query_mode);  	if (ret)  		goto out_fini_err_data; @@ -1376,7 +1397,7 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_i  	info->ce_count = obj->err_data.ce_count;  	info->de_count = obj->err_data.de_count; -	amdgpu_ras_error_generate_report(adev, info, &err_data); +	amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);  out_fini_err_data:  	amdgpu_ras_error_data_fini(&err_data); @@ -2041,7 +2062,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *  		}  	} -	amdgpu_umc_poison_handler(adev, obj->head.block, false); +	amdgpu_umc_poison_handler(adev, obj->head.block, 0);  	if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)  		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); @@ -2061,6 +2082,17 @@ static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  {  	dev_info(obj->adev->dev,  		"Poison is created\n"); + +	if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) { +		struct amdgpu_ras *con = amdgpu_ras_get_context(obj->adev); + +		amdgpu_ras_put_poison_req(obj->adev, +			AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false); + +		atomic_inc(&con->page_retirement_req_cnt); + +		wake_up(&con->page_retirement_wq); +	}  }  static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, @@ -2371,7 +2403,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,  			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,  		};  		status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, -				data->bps[i].retired_page); +				data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT);  		if (status == -EBUSY)  			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;  		else if (status == -ENOENT) @@ -2384,6 +2416,19 @@ out:  	return ret;  } +static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, +				   struct amdgpu_hive_info *hive, bool status) +{ +	struct amdgpu_device *tmp_adev; + +	if (hive) { +		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) +			amdgpu_ras_set_fed(tmp_adev, status); +	} else { +		amdgpu_ras_set_fed(adev, status); +	} +} +  static void amdgpu_ras_do_recovery(struct work_struct *work)  {  	struct amdgpu_ras *ras = @@ -2393,8 +2438,21 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  	struct list_head device_list, *device_list_handle =  NULL;  	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); -	if (hive) +	if (hive) {  		atomic_set(&hive->ras_recovery, 1); + +		/* If any device which is part of the hive received RAS fatal +		 * error interrupt, set fatal error status on all. This +		 * condition will need a recovery, and flag will be cleared +		 * as part of recovery. 
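+		 * Checking every hive member here, rather than only the
+		 * local device, keeps the hive-wide fatal-error state
+		 * consistent before the reset work below is scheduled.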
+		 */ +		list_for_each_entry(remote_adev, &hive->device_list, +				    gmc.xgmi.head) +			if (amdgpu_ras_get_fed_status(remote_adev)) { +				amdgpu_ras_set_fed_all(adev, hive, true); +				break; +			} +	}  	if (!ras->disable_ras_err_cnt_harvest) {  		/* Build list of devices to query RAS related errors */ @@ -2439,18 +2497,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)  				ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;  				set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); -				/* For any RAS error that needs a full reset to -				 * recover, set the fatal error status -				 */ -				if (hive) { -					list_for_each_entry(remote_adev, -							    &hive->device_list, -							    gmc.xgmi.head) -						amdgpu_ras_set_fed(remote_adev, -								   true); -				} else { -					amdgpu_ras_set_fed(adev, true); -				}  				psp_fatal_error_recovery_quirk(&adev->psp);  			}  		} @@ -2516,9 +2562,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,  			goto out;  		} -		amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, -			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT, -			AMDGPU_GPU_PAGE_SIZE); +		amdgpu_ras_reserve_page(adev, bps[i].retired_page);  		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));  		data->count++; @@ -2674,10 +2718,167 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,  	}  } +int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, +		enum amdgpu_ras_block block, uint16_t pasid, +		pasid_notify pasid_fn, void *data, uint32_t reset) +{ +	int ret = 0; +	struct ras_poison_msg poison_msg; +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + +	memset(&poison_msg, 0, sizeof(poison_msg)); +	poison_msg.block = block; +	poison_msg.pasid = pasid; +	poison_msg.reset = reset; +	poison_msg.pasid_fn = pasid_fn; +	poison_msg.data = data; + +	ret = kfifo_put(&con->poison_fifo, poison_msg); +	if (!ret) { +		dev_err(adev->dev, "Poison message fifo is full!\n"); +		return -ENOSPC; +	} + +	return 0; +} + +static int amdgpu_ras_get_poison_req(struct amdgpu_device *adev, +		struct ras_poison_msg *poison_msg) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + +	return kfifo_get(&con->poison_fifo, poison_msg); +} + +static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) +{ +	mutex_init(&ecc_log->lock); + +	/* Set any value as siphash key */ +	memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key)); + +	INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); +	ecc_log->de_updated = false; +} + +static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) +{ +	struct radix_tree_iter iter; +	void __rcu **slot; +	struct ras_ecc_err *ecc_err; + +	mutex_lock(&ecc_log->lock); +	radix_tree_for_each_slot(slot, &ecc_log->de_page_tree, &iter, 0) { +		ecc_err = radix_tree_deref_slot(slot); +		kfree(ecc_err->err_pages.pfn); +		kfree(ecc_err); +		radix_tree_iter_delete(&ecc_log->de_page_tree, &iter, slot); +	} +	mutex_unlock(&ecc_log->lock); + +	mutex_destroy(&ecc_log->lock); +	ecc_log->de_updated = false; +} + +static void amdgpu_ras_do_page_retirement(struct work_struct *work) +{ +	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras, +					      page_retirement_dwork.work); +	struct amdgpu_device *adev = con->adev; +	struct ras_err_data err_data; + +	if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) +		return; + +	amdgpu_ras_error_data_init(&err_data); + +	amdgpu_umc_handle_bad_pages(adev, &err_data); + +	amdgpu_ras_error_data_fini(&err_data); + +	
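+	/* While any entry in the deferred-error page tree is still tagged
+	 * UMC_ECC_NEW_DETECTED_TAG, re-arm this delayed work so that newly
+	 * logged bad pages keep getting retired.
+	 */
+	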
mutex_lock(&con->umc_ecc_log.lock); +	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, +				UMC_ECC_NEW_DETECTED_TAG)) +		schedule_delayed_work(&con->page_retirement_dwork, +			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL)); +	mutex_unlock(&con->umc_ecc_log.lock); +} + +static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev, +			enum amdgpu_ras_block ras_block, uint32_t timeout_ms) +{ +	int ret = 0; +	struct ras_ecc_log_info *ecc_log; +	struct ras_query_if info; +	uint32_t timeout = timeout_ms; +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + +	memset(&info, 0, sizeof(info)); +	info.head.block = ras_block; + +	ecc_log = &ras->umc_ecc_log; +	ecc_log->de_updated = false; +	do { +		ret = amdgpu_ras_query_error_status(adev, &info); +		if (ret) { +			dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret); +			return ret; +		} + +		if (timeout && !ecc_log->de_updated) { +			msleep(1); +			timeout--; +		} +	} while (timeout && !ecc_log->de_updated); + +	if (timeout_ms && !timeout) { +		dev_warn(adev->dev, "Can't find deferred error\n"); +		return -ETIMEDOUT; +	} + +	return 0; +} + +static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, +					uint32_t timeout) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	int ret; + +	ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout); +	if (!ret) +		schedule_delayed_work(&con->page_retirement_dwork, 0); +} + +static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, +			struct ras_poison_msg *poison_msg) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	uint32_t reset = poison_msg->reset; +	uint16_t pasid = poison_msg->pasid; + +	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + +	if (poison_msg->pasid_fn) +		poison_msg->pasid_fn(adev, pasid, poison_msg->data); + +	if (reset) { +		flush_delayed_work(&con->page_retirement_dwork); + +		con->gpu_reset_flags |= reset; +		amdgpu_ras_reset_gpu(adev); +	} + +	return 0; +} +  static int amdgpu_ras_page_retirement_thread(void *param)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)param;  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	struct ras_poison_msg poison_msg; +	enum amdgpu_ras_block ras_block; +	bool poison_creation_is_handled = false;  	while (!kthread_should_stop()) { @@ -2688,13 +2889,34 @@ static int amdgpu_ras_page_retirement_thread(void *param)  		if (kthread_should_stop())  			break; -		dev_info(adev->dev, "Start processing page retirement. request:%d\n", -			atomic_read(&con->page_retirement_req_cnt)); -  		atomic_dec(&con->page_retirement_req_cnt); -		amdgpu_umc_bad_page_polling_timeout(adev, -				false, MAX_UMC_POISON_POLLING_TIME_ASYNC); +		if (!amdgpu_ras_get_poison_req(adev, &poison_msg)) +			continue; + +		ras_block = poison_msg.block; + +		dev_info(adev->dev, "Start processing ras block %s(%d)\n", +				ras_block_str(ras_block), ras_block); + +		if (ras_block == AMDGPU_RAS_BLOCK__UMC) { +			amdgpu_ras_poison_creation_handler(adev, +				MAX_UMC_POISON_POLLING_TIME_ASYNC); +			poison_creation_is_handled = true; +		} else { +			/* poison_creation_is_handled: +			 *   false: this consumption interrupt arrived without any +			 *          earlier poison creation interrupt. +			 *   true: a poison creation interrupt was already handled, +			 *         so no further creation polling is needed here. +			 */ +			amdgpu_ras_poison_creation_handler(adev, +					poison_creation_is_handled ? 
+					0 : MAX_UMC_POISON_POLLING_TIME_ASYNC); + +			amdgpu_ras_poison_consumption_handler(adev, &poison_msg); +			poison_creation_is_handled = false; +		}  	}  	return 0; @@ -2763,6 +2985,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  		}  	} +	mutex_init(&con->page_rsv_lock); +	INIT_KFIFO(con->poison_fifo);  	mutex_init(&con->page_retirement_lock);  	init_waitqueue_head(&con->page_retirement_wq);  	atomic_set(&con->page_retirement_req_cnt, 0); @@ -2773,6 +2997,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  		dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");  	} +	INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement); +	amdgpu_ras_ecc_log_init(&con->umc_ecc_log);  #ifdef CONFIG_X86_MCE_AMD  	if ((adev->asic_type == CHIP_ALDEBARAN) &&  	    (adev->gmc.xgmi.connected_to_cpu)) @@ -2813,8 +3039,14 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)  	atomic_set(&con->page_retirement_req_cnt, 0); +	mutex_destroy(&con->page_rsv_lock); +  	cancel_work_sync(&con->recovery_work); +	cancel_delayed_work_sync(&con->page_retirement_dwork); + +	amdgpu_ras_ecc_log_fini(&con->umc_ecc_log); +  	mutex_lock(&con->recovery_lock);  	con->eh_data = NULL;  	kfree(data->bps); @@ -3036,6 +3268,35 @@ static int amdgpu_get_ras_schema(struct amdgpu_device *adev)  			AMDGPU_RAS_ERROR__PARITY;  } +static void ras_event_mgr_init(struct ras_event_manager *mgr) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(mgr->seqnos); i++) +		atomic64_set(&mgr->seqnos[i], 0); +} + +static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev) +{ +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	struct amdgpu_hive_info *hive; + +	if (!ras) +		return; + +	hive = amdgpu_get_xgmi_hive(adev); +	ras->event_mgr = hive ? 
&hive->event_mgr : &ras->__event_mgr; + +	/* init event manager with node 0 on xgmi system */ +	if (!amdgpu_in_reset(adev)) { +		if (!hive || adev->gmc.xgmi.node_id == 0) +			ras_event_mgr_init(ras->event_mgr); +	} + +	if (hive) +		amdgpu_put_xgmi_hive(hive); +} +  int amdgpu_ras_init(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -3356,6 +3617,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)  	if (amdgpu_sriov_vf(adev))  		return 0; +	amdgpu_ras_event_mgr_init(adev); +  	if (amdgpu_aca_is_enabled(adev)) {  		if (amdgpu_in_reset(adev))  			r = amdgpu_aca_reset(adev); @@ -3472,14 +3735,39 @@ void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)  		atomic_set(&ras->fed, !!status);  } +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id) +{ +	return !(id & BIT_ULL(63)); +} + +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type) +{ +	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +	u64 id; + +	switch (type) { +	case RAS_EVENT_TYPE_ISR: +		id = (u64)atomic64_read(&ras->event_mgr->seqnos[type]); +		break; +	case RAS_EVENT_TYPE_INVALID: +	default: +		id = BIT_ULL(63) | 0ULL; +		break; +	} + +	return id; +} +  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  {  	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {  		struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); +		u64 event_id = (u64)atomic64_inc_return(&ras->event_mgr->seqnos[RAS_EVENT_TYPE_ISR]); -		dev_info(adev->dev, "uncorrectable hardware error" -			"(ERREVENT_ATHUB_INTERRUPT) detected!\n"); +		RAS_EVENT_LOG(adev, event_id, "uncorrectable hardware error" +			      "(ERREVENT_ATHUB_INTERRUPT) detected!\n"); +		amdgpu_ras_set_fed(adev, true);  		ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;  		amdgpu_ras_reset_gpu(adev);  	} @@ -3998,6 +4286,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_a  {  	struct ras_err_addr *mca_err_addr; +	/* This function will be retired. */ +	return;  	mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);  	if (!mca_err_addr)  		return; @@ -4195,3 +4485,19 @@ void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)  			amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);  	}  } + +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; +	uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; +	int ret = 0; + +	mutex_lock(&con->page_rsv_lock); +	ret = amdgpu_vram_mgr_query_page_status(mgr, start); +	if (ret == -ENOENT) +		ret = amdgpu_vram_mgr_reserve_range(mgr, start, AMDGPU_GPU_PAGE_SIZE); +	mutex_unlock(&con->page_rsv_lock); + +	return ret; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e0f8ce9d8440..c8980d5f6540 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -26,6 +26,9 @@  #include <linux/debugfs.h>  #include <linux/list.h> +#include <linux/kfifo.h> +#include <linux/radix-tree.h> +#include <linux/siphash.h>  #include "ta_ras_if.h"  #include "amdgpu_ras_eeprom.h"  #include "amdgpu_smuio.h" @@ -64,6 +67,14 @@ struct amdgpu_iv_entry;  /* The high three bits indicates socketid */  #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) +#define RAS_EVENT_LOG(_adev, _id, _fmt, ...)				
\ +do {									\ +	if (amdgpu_ras_event_id_is_valid((_adev), (_id)))			\ +	    dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__);	\ +	else								\ +	    dev_info((_adev)->dev, _fmt, ##__VA_ARGS__);			\ +} while (0) +  enum amdgpu_ras_block {  	AMDGPU_RAS_BLOCK__UMC = 0,  	AMDGPU_RAS_BLOCK__SDMA, @@ -419,6 +430,52 @@ struct umc_ecc_info {  	int record_ce_addr_supported;  }; +enum ras_event_type { +	RAS_EVENT_TYPE_INVALID = -1, +	RAS_EVENT_TYPE_ISR = 0, +	RAS_EVENT_TYPE_COUNT, +}; + +struct ras_event_manager { +	atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; +}; + +struct ras_query_context { +	enum ras_event_type type; +	u64 event_id; +}; + +typedef int (*pasid_notify)(struct amdgpu_device *adev, +		uint16_t pasid, void *data); + +struct ras_poison_msg { +	enum amdgpu_ras_block block; +	uint16_t pasid; +	uint32_t reset; +	pasid_notify pasid_fn; +	void *data; +}; + +struct ras_err_pages { +	uint32_t count; +	uint64_t *pfn; +}; + +struct ras_ecc_err { +	u64 hash_index; +	uint64_t status; +	uint64_t ipid; +	uint64_t addr; +	struct ras_err_pages err_pages; +}; + +struct ras_ecc_log_info { +	struct mutex lock; +	siphash_key_t ecc_key; +	struct radix_tree_root de_page_tree; +	bool	de_updated; +}; +  struct amdgpu_ras {  	/* ras infrastructure */  	/* for ras itself. */ @@ -477,8 +534,18 @@ struct amdgpu_ras {  	wait_queue_head_t page_retirement_wq;  	struct mutex page_retirement_lock;  	atomic_t page_retirement_req_cnt; +	struct mutex page_rsv_lock; +	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); +	struct ras_ecc_log_info  umc_ecc_log; +	struct delayed_work page_retirement_dwork; +  	/* Fatal error detected flag */  	atomic_t fed; + +	/* RAS event manager */ +	struct ras_event_manager __event_mgr; +	struct ras_event_manager *event_mgr; +  };  struct ras_fs_data { @@ -512,6 +579,7 @@ struct ras_err_data {  	unsigned long de_count;  	unsigned long err_addr_cnt;  	struct eeprom_table_record *err_addr; +	unsigned long err_addr_len;  	u32 err_list_count;  	struct list_head err_node_list;  }; @@ -879,4 +947,13 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);  bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); +bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id); +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); + +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); + +int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, +		enum amdgpu_ras_block block, uint16_t pasid, +		pasid_notify pasid_fn, void *data, uint32_t reset); +  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index b12808c0c331..06a62a8a992e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -404,6 +404,22 @@ static int amdgpu_ras_eeprom_correct_header_tag(  	return res;  } +static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control *control) +{ +	struct amdgpu_device *adev = to_amdgpu_device(control); +	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + +	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { +	case IP_VERSION(8, 10, 0): +	case IP_VERSION(12, 0, 0): +		hdr->version = RAS_TABLE_VER_V2_1; +		return; +	default: +		hdr->version = RAS_TABLE_VER_V1; +		return; +	} +} +  /**   * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table   * @control: pointer to control structure @@ 
-423,11 +439,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)  	mutex_lock(&control->ras_tbl_mutex);  	hdr->header = RAS_TABLE_HDR_VAL; -	if (adev->umc.ras && -	    adev->umc.ras->set_eeprom_table_version) -		adev->umc.ras->set_eeprom_table_version(hdr); -	else -		hdr->version = RAS_TABLE_VER_V1; +	amdgpu_ras_set_eeprom_table_version(control);  	if (hdr->version == RAS_TABLE_VER_V2_1) {  		hdr->first_rec_offset = RAS_RECORD_START_V2_1; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h index 381101d2bf05..50fcd86e1033 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct amdgpu_res_cursor *cur, uint64_t size)  	}  } +/** + * amdgpu_res_cleared - check if blocks are cleared + * + * @cur: the cursor to extract the block + * + * Check if the @cur block is cleared + */ +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur) +{ +	struct drm_buddy_block *block; + +	switch (cur->mem_type) { +	case TTM_PL_VRAM: +		block = cur->node; + +		if (!amdgpu_vram_mgr_is_cleared(block)) +			return false; +		break; +	default: +		return false; +	} + +	return true; +} +  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c index 147100c27c2d..ea4873f6ccd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c @@ -21,9 +21,6 @@   *   */ -#include <linux/devcoredump.h> -#include <generated/utsrelease.h> -  #include "amdgpu_reset.h"  #include "aldebaran.h"  #include "sienna_cichlid.h" @@ -161,105 +158,3 @@ void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)  	atomic_set(&reset_domain->in_gpu_reset, 0);  	up_write(&reset_domain->sem);  } - -#ifndef CONFIG_DEV_COREDUMP -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, -		     struct amdgpu_reset_context *reset_context) -{ -} -#else -static ssize_t -amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, -			void *data, size_t datalen) -{ -	struct drm_printer p; -	struct amdgpu_coredump_info *coredump = data; -	struct drm_print_iterator iter; -	int i; - -	iter.data = buffer; -	iter.offset = 0; -	iter.start = offset; -	iter.remain = count; - -	p = drm_coredump_printer(&iter); - -	drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); -	drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n"); -	drm_printf(&p, "kernel: " UTS_RELEASE "\n"); -	drm_printf(&p, "module: " KBUILD_MODNAME "\n"); -	drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, -			coredump->reset_time.tv_nsec); - -	if (coredump->reset_task_info.pid) -		drm_printf(&p, "process_name: %s PID: %d\n", -			   coredump->reset_task_info.process_name, -			   coredump->reset_task_info.pid); - -	if (coredump->ring) { -		drm_printf(&p, "\nRing timed out details\n"); -		drm_printf(&p, "IP Type: %d Ring Name: %s\n", -			   coredump->ring->funcs->type, -			   coredump->ring->name); -	} - -	if (coredump->reset_vram_lost) -		drm_printf(&p, "VRAM is lost due to GPU reset!\n"); -	if (coredump->adev->reset_info.num_regs) { -		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n"); - -		for (i = 0; i < coredump->adev->reset_info.num_regs; i++) -			drm_printf(&p, "0x%08x: 0x%08x\n", -				   coredump->adev->reset_info.reset_dump_reg_list[i], -				   coredump->adev->reset_info.reset_dump_reg_value[i]); -	} - -	return count - 
iter.remain; -} - -static void amdgpu_devcoredump_free(void *data) -{ -	kfree(data); -} - -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, -		     struct amdgpu_reset_context *reset_context) -{ -	struct amdgpu_coredump_info *coredump; -	struct drm_device *dev = adev_to_drm(adev); -	struct amdgpu_job *job = reset_context->job; -	struct drm_sched_job *s_job; - -	coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT); - -	if (!coredump) { -		DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__); -		return; -	} - -	coredump->reset_vram_lost = vram_lost; - -	if (reset_context->job && reset_context->job->vm) { -		struct amdgpu_task_info *ti; -		struct amdgpu_vm *vm = reset_context->job->vm; - -		ti = amdgpu_vm_get_task_info_vm(vm); -		if (ti) { -			coredump->reset_task_info = *ti; -			amdgpu_vm_put_task_info(ti); -		} -	} - -	if (job) { -		s_job = &job->base; -		coredump->ring = to_amdgpu_ring(s_job->sched); -	} - -	coredump->adev = adev; - -	ktime_get_ts64(&coredump->reset_time); - -	dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT, -		      amdgpu_devcoredump_read, amdgpu_devcoredump_free); -} -#endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index 60522963aaca..b11d190ece53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -32,6 +32,7 @@ enum AMDGPU_RESET_FLAGS {  	AMDGPU_NEED_FULL_RESET = 0,  	AMDGPU_SKIP_HW_RESET = 1, +	AMDGPU_SKIP_COREDUMP = 2,  };  struct amdgpu_reset_context { @@ -88,19 +89,6 @@ struct amdgpu_reset_domain {  	atomic_t reset_res;  }; -#ifdef CONFIG_DEV_COREDUMP - -#define AMDGPU_COREDUMP_VERSION "1" - -struct amdgpu_coredump_info { -	struct amdgpu_device		*adev; -	struct amdgpu_task_info         reset_task_info; -	struct timespec64               reset_time; -	bool                            reset_vram_lost; -	struct amdgpu_ring			*ring; -}; -#endif -  int amdgpu_reset_init(struct amdgpu_device *adev);  int amdgpu_reset_fini(struct amdgpu_device *adev); @@ -141,9 +129,6 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);  void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain); -void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, -		     struct amdgpu_reset_context *reset_context); -  #define for_each_handler(i, handler, reset_ctl)                  \  	for (i = 0; (i < AMDGPU_RESET_MAX_HANDLERS) &&           \  		    (handler = (*reset_ctl->reset_handlers)[i]); \ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index 5505d646f43a..06f0a6534a94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -524,46 +524,58 @@ static ssize_t amdgpu_debugfs_mqd_read(struct file *f, char __user *buf,  {  	struct amdgpu_ring *ring = file_inode(f)->i_private;  	volatile u32 *mqd; -	int r; +	u32 *kbuf; +	int r, i;  	uint32_t value, result;  	if (*pos & 3 || size & 3)  		return -EINVAL; -	result = 0; +	kbuf = kmalloc(ring->mqd_size, GFP_KERNEL); +	if (!kbuf) +		return -ENOMEM;  	r = amdgpu_bo_reserve(ring->mqd_obj, false);  	if (unlikely(r != 0)) -		return r; +		goto err_free;  	r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&mqd); -	if (r) { -		amdgpu_bo_unreserve(ring->mqd_obj); -		return r; -	} +	if (r) +		goto err_unreserve; +	/* +	 * Copy to local buffer to avoid put_user(), which might fault +	 * and acquire mmap_sem, under reservation_ww_class_mutex. 
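+	 * Faulting there could invert the mmap lock vs. buffer reservation
+	 * ordering, so the MQD is snapshotted into kbuf first and copied to
+	 * userspace only after the reservation has been dropped.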
+	 */ +	for (i = 0; i < ring->mqd_size/sizeof(u32); i++) +		kbuf[i] = mqd[i]; + +	amdgpu_bo_kunmap(ring->mqd_obj); +	amdgpu_bo_unreserve(ring->mqd_obj); + +	result = 0;  	while (size) {  		if (*pos >= ring->mqd_size) -			goto done; +			break; -		value = mqd[*pos/4]; +		value = kbuf[*pos/4];  		r = put_user(value, (uint32_t *)buf);  		if (r) -			goto done; +			goto err_free;  		buf += 4;  		result += 4;  		size -= 4;  		*pos += 4;  	} -done: -	amdgpu_bo_kunmap(ring->mqd_obj); -	mqd = NULL; -	amdgpu_bo_unreserve(ring->mqd_obj); -	if (r) -		return r; - +	kfree(kbuf);  	return result; + +err_unreserve: +	amdgpu_bo_unreserve(ring->mqd_obj); +err_free: +	kfree(kbuf); +	return r;  }  static const struct file_operations amdgpu_debugfs_mqd_fops = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 173a2a308078..b51a82e711df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -132,7 +132,7 @@ struct amdgpu_buffer_funcs {  				 uint64_t dst_offset,  				 /* number of byte to transfer */  				 uint32_t byte_count, -				 bool tmz); +				 uint32_t copy_flags);  	/* maximum bytes in a single operation */  	uint32_t	fill_max_bytes; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h index ff4435181055..ec9d12f85f39 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_smuio.h @@ -44,6 +44,7 @@ struct amdgpu_smuio_funcs {  	u32 (*get_socket_id)(struct amdgpu_device *adev);  	enum amdgpu_pkg_type (*get_pkg_type)(struct amdgpu_device *adev);  	bool (*is_host_gpu_xgmi_supported)(struct amdgpu_device *adev); +	u64 (*get_gpu_clock_counter)(struct amdgpu_device *adev);  };  struct amdgpu_smuio { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index fc418e670fda..3749892bf702 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -133,7 +133,7 @@ static void amdgpu_evict_flags(struct ttm_buffer_object *bo,  		} else if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&  			   !(abo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) && -			   amdgpu_bo_in_cpu_visible_vram(abo)) { +			   amdgpu_res_cpu_visible(adev, bo->resource)) {  			/* Try evicting to the CPU inaccessible part of VRAM  			 * first, but only set GTT as busy placement, so this @@ -236,7 +236,7 @@ static int amdgpu_ttm_map_buffer(struct ttm_buffer_object *bo,  	dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);  	dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8;  	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, -				dst_addr, num_bytes, false); +				dst_addr, num_bytes, 0);  	amdgpu_ring_pad_ib(ring, &job->ibs[0]);  	WARN_ON(job->ibs[0].length_dw > num_dw); @@ -296,6 +296,8 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,  	struct dma_fence *fence = NULL;  	int r = 0; +	uint32_t copy_flags = 0; +  	if (!adev->mman.buffer_funcs_enabled) {  		DRM_ERROR("Trying to move memory with ring turned off.\n");  		return -EINVAL; @@ -323,8 +325,11 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,  		if (r)  			goto error; -		r = amdgpu_copy_buffer(ring, from, to, cur_size, -				       resv, &next, false, true, tmz); +		if (tmz) +			copy_flags |= AMDGPU_COPY_FLAGS_TMZ; + +		r = amdgpu_copy_buffer(ring, from, to, cur_size, resv, +				       &next, false, true, copy_flags);  		if (r)  			goto error; @@ -378,11 +383,12 @@ static int amdgpu_move_blit(struct 
ttm_buffer_object *bo,  	    (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {  		struct dma_fence *wipe_fence = NULL; -		r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence, -					false); +		r = amdgpu_fill_buffer(abo, 0, NULL, &wipe_fence, +				       false);  		if (r) {  			goto error;  		} else if (wipe_fence) { +			amdgpu_vram_mgr_set_cleared(bo->resource);  			dma_fence_put(fence);  			fence = wipe_fence;  		} @@ -403,40 +409,55 @@ error:  	return r;  } -/* - * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy +/** + * amdgpu_res_cpu_visible - Check that resource can be accessed by CPU + * @adev: amdgpu device + * @res: the resource to check   * - * Called by amdgpu_bo_move() + * Returns: true if the full resource is CPU visible, false otherwise.   */ -static bool amdgpu_mem_visible(struct amdgpu_device *adev, -			       struct ttm_resource *mem) +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, +			    struct ttm_resource *res)  { -	u64 mem_size = (u64)mem->size;  	struct amdgpu_res_cursor cursor; -	u64 end; -	if (mem->mem_type == TTM_PL_SYSTEM || -	    mem->mem_type == TTM_PL_TT) +	if (!res) +		return false; + +	if (res->mem_type == TTM_PL_SYSTEM || res->mem_type == TTM_PL_TT || +	    res->mem_type == AMDGPU_PL_PREEMPT)  		return true; -	if (mem->mem_type != TTM_PL_VRAM) + +	if (res->mem_type != TTM_PL_VRAM)  		return false; -	amdgpu_res_first(mem, 0, mem_size, &cursor); -	end = cursor.start + cursor.size; +	amdgpu_res_first(res, 0, res->size, &cursor);  	while (cursor.remaining) { +		if ((cursor.start + cursor.size) >= adev->gmc.visible_vram_size) +			return false;  		amdgpu_res_next(&cursor, cursor.size); +	} -		if (!cursor.remaining) -			break; +	return true; +} -		/* ttm_resource_ioremap only supports contiguous memory */ -		if (end != cursor.start) -			return false; +/* + * amdgpu_res_copyable - Check that memory can be accessed by ttm_bo_move_memcpy + * + * Called by amdgpu_bo_move() + */ +static bool amdgpu_res_copyable(struct amdgpu_device *adev, +				struct ttm_resource *mem) +{ +	if (!amdgpu_res_cpu_visible(adev, mem)) +		return false; -		end = cursor.start + cursor.size; -	} +	/* ttm_resource_ioremap only supports contiguous memory */ +	if (mem->mem_type == TTM_PL_VRAM && +	    !(mem->placement & TTM_PL_FLAG_CONTIGUOUS)) +		return false; -	return end <= adev->gmc.visible_vram_size; +	return true;  }  /* @@ -529,8 +550,8 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,  	if (r) {  		/* Check that all memory is CPU accessible */ -		if (!amdgpu_mem_visible(adev, old_mem) || -		    !amdgpu_mem_visible(adev, new_mem)) { +		if (!amdgpu_res_copyable(adev, old_mem) || +		    !amdgpu_res_copyable(adev, new_mem)) {  			pr_err("Move buffer fallback to memcpy unavailable\n");  			return r;  		} @@ -557,7 +578,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev,  				     struct ttm_resource *mem)  {  	struct amdgpu_device *adev = amdgpu_ttm_adev(bdev); -	size_t bus_size = (size_t)mem->size;  	switch (mem->mem_type) {  	case TTM_PL_SYSTEM: @@ -568,9 +588,6 @@ static int amdgpu_ttm_io_mem_reserve(struct ttm_device *bdev,  		break;  	case TTM_PL_VRAM:  		mem->bus.offset = mem->start << PAGE_SHIFT; -		/* check if it's visible */ -		if ((mem->bus.offset + bus_size) > adev->gmc.visible_vram_size) -			return -EINVAL;  		if (adev->mman.aper_base_kaddr &&  		    mem->placement & TTM_PL_FLAG_CONTIGUOUS) @@ -1477,7 +1494,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo,  		
swap(src_addr, dst_addr);  	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr, -				PAGE_SIZE, false); +				PAGE_SIZE, 0);  	amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);  	WARN_ON(job->ibs[0].length_dw > num_dw); @@ -2128,7 +2145,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,  		       uint64_t dst_offset, uint32_t byte_count,  		       struct dma_resv *resv,  		       struct dma_fence **fence, bool direct_submit, -		       bool vm_needs_flush, bool tmz) +		       bool vm_needs_flush, uint32_t copy_flags)  {  	struct amdgpu_device *adev = ring->adev;  	unsigned int num_loops, num_dw; @@ -2154,8 +2171,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,  		uint32_t cur_size_in_bytes = min(byte_count, max_bytes);  		amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_offset, -					dst_offset, cur_size_in_bytes, tmz); - +					dst_offset, cur_size_in_bytes, copy_flags);  		src_offset += cur_size_in_bytes;  		dst_offset += cur_size_in_bytes;  		byte_count -= cur_size_in_bytes; @@ -2215,6 +2231,71 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data,  	return 0;  } +/** + * amdgpu_ttm_clear_buffer - clear memory buffers + * @bo: amdgpu buffer object + * @resv: reservation object + * @fence: dma_fence associated with the operation + * + * Clear the memory buffer resource. + * + * Returns: + * 0 for success or a negative error code on failure. + */ +int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, +			    struct dma_resv *resv, +			    struct dma_fence **fence) +{ +	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); +	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; +	struct amdgpu_res_cursor cursor; +	u64 addr; +	int r; + +	if (!adev->mman.buffer_funcs_enabled) +		return -EINVAL; + +	if (!fence) +		return -EINVAL; + +	*fence = dma_fence_get_stub(); + +	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); + +	mutex_lock(&adev->mman.gtt_window_lock); +	while (cursor.remaining) { +		struct dma_fence *next = NULL; +		u64 size; + +		if (amdgpu_res_cleared(&cursor)) { +			amdgpu_res_next(&cursor, cursor.size); +			continue; +		} + +		/* Never clear more than 256MiB at once to avoid timeouts */ +		size = min(cursor.size, 256ULL << 20); + +		r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &cursor, +					  1, ring, false, &size, &addr); +		if (r) +			goto err; + +		r = amdgpu_ttm_fill_mem(ring, 0, addr, size, resv, +					&next, true, true); +		if (r) +			goto err; + +		dma_fence_put(*fence); +		*fence = next; + +		amdgpu_res_next(&cursor, size); +	} +err: +	mutex_unlock(&adev->mman.gtt_window_lock); + +	return r; +} +  int amdgpu_fill_buffer(struct amdgpu_bo *bo,  			uint32_t src_data,  			struct dma_resv *resv, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 65ec82141a8e..b6f53129dea3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -38,8 +38,6 @@  #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512  #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2 -#define AMDGPU_POISON	0xd0bed0be -  extern const struct attribute_group amdgpu_vram_mgr_attr_group;  extern const struct attribute_group amdgpu_gtt_mgr_attr_group; @@ -111,6 +109,8 @@ struct amdgpu_copy_mem {  	unsigned long			offset;  }; +#define AMDGPU_COPY_FLAGS_TMZ		(1 << 0) +  int amdgpu_gtt_mgr_init(struct amdgpu_device *adev, uint64_t gtt_size);  void amdgpu_gtt_mgr_fini(struct amdgpu_device *adev);  int 
amdgpu_preempt_mgr_init(struct amdgpu_device *adev); @@ -139,6 +139,9 @@ int amdgpu_vram_mgr_reserve_range(struct amdgpu_vram_mgr *mgr,  int amdgpu_vram_mgr_query_page_status(struct amdgpu_vram_mgr *mgr,  				      uint64_t start); +bool amdgpu_res_cpu_visible(struct amdgpu_device *adev, +			    struct ttm_resource *res); +  int amdgpu_ttm_init(struct amdgpu_device *adev);  void amdgpu_ttm_fini(struct amdgpu_device *adev);  void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, @@ -148,13 +151,16 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,  		       uint64_t dst_offset, uint32_t byte_count,  		       struct dma_resv *resv,  		       struct dma_fence **fence, bool direct_submit, -		       bool vm_needs_flush, bool tmz); +		       bool vm_needs_flush, uint32_t copy_flags);  int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,  			       const struct amdgpu_copy_mem *src,  			       const struct amdgpu_copy_mem *dst,  			       uint64_t size, bool tmz,  			       struct dma_resv *resv,  			       struct dma_fence **f); +int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, +			    struct dma_resv *resv, +			    struct dma_fence **fence);  int amdgpu_fill_buffer(struct amdgpu_bo *bo,  			uint32_t src_data,  			struct dma_resv *resv, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 619445760037..105d4de0613a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -125,6 +125,7 @@ enum psp_fw_type {  	PSP_FW_TYPE_PSP_INTF_DRV,  	PSP_FW_TYPE_PSP_DBG_DRV,  	PSP_FW_TYPE_PSP_RAS_DRV, +	PSP_FW_TYPE_PSP_IPKEYMGR_DRV,  	PSP_FW_TYPE_MAX_INDEX,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 20436f81856a..540e0f066b26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -21,10 +21,13 @@   *   */ +#include <linux/sort.h>  #include "amdgpu.h"  #include "umc_v6_7.h"  #define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms +#define MAX_UMC_HASH_STRING_SIZE  256 +  static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,  				    struct ras_err_data *err_data, uint64_t err_addr,  				    uint32_t ch_inst, uint32_t umc_inst) @@ -63,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,  		goto out_fini_err_data;  	} +	err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query; +  	/*  	 * Translate UMC channel address to Physical address  	 */ @@ -86,7 +91,7 @@ out_fini_err_data:  	return ret;  } -static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, +void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,  			void *ras_error_status)  {  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; @@ -118,6 +123,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,  			if(!err_data->err_addr)  				dev_warn(adev->dev, "Failed to alloc memory for "  						"umc error address record!\n"); +			else +				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;  			/* umc query_ras_error_address is also responsible for clearing  			 * error status @@ -143,6 +150,8 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,  			if(!err_data->err_addr)  				dev_warn(adev->dev, "Failed to alloc memory for "  						"umc error address record!\n"); +			else +				err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;  			/* umc query_ras_error_address is also 
responsible for clearing  			 * error status @@ -170,6 +179,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,  	}  	kfree(err_data->err_addr); +	err_data->err_addr = NULL;  	mutex_unlock(&con->page_retirement_lock);  } @@ -177,7 +187,7 @@ static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,  static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,  		void *ras_error_status,  		struct amdgpu_iv_entry *entry, -		bool reset) +		uint32_t reset)  {  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -186,9 +196,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,  	amdgpu_umc_handle_bad_pages(adev, ras_error_status);  	if (err_data->ue_count && reset) { -		/* use mode-2 reset for poison consumption */ -		if (!entry) -			con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; +		con->gpu_reset_flags |= reset;  		amdgpu_ras_reset_gpu(adev);  	} @@ -196,7 +204,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,  }  int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, -			bool reset, uint32_t timeout_ms) +			uint32_t reset, uint32_t timeout_ms)  {  	struct ras_err_data err_data;  	struct ras_common_if head = { @@ -238,16 +246,16 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,  	if (reset) {  		struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -		/* use mode-2 reset for poison consumption */ -		con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; +		con->gpu_reset_flags |= reset;  		amdgpu_ras_reset_gpu(adev);  	}  	return 0;  } -int amdgpu_umc_poison_handler(struct amdgpu_device *adev, -			enum amdgpu_ras_block block, bool reset) +int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, +			enum amdgpu_ras_block block, uint16_t pasid, +			pasid_notify pasid_fn, void *data, uint32_t reset)  {  	int ret = AMDGPU_RAS_SUCCESS; @@ -285,16 +293,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,  			amdgpu_ras_error_data_fini(&err_data);  		} else { -			if (reset) { -				amdgpu_umc_bad_page_polling_timeout(adev, -							reset, MAX_UMC_POISON_POLLING_TIME_SYNC); -			} else {  				struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +				amdgpu_ras_put_poison_req(adev, +					block, pasid, pasid_fn, data, reset); +  				atomic_inc(&con->page_retirement_req_cnt);  				wake_up(&con->page_retirement_wq); -			}  		}  	} else {  		if (adev->virt.ops && adev->virt.ops->ras_poison_handler) @@ -307,11 +313,19 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,  	return ret;  } +int amdgpu_umc_poison_handler(struct amdgpu_device *adev, +			enum amdgpu_ras_block block, uint32_t reset) +{ +	return amdgpu_umc_pasid_poison_handler(adev, +				block, 0, NULL, NULL, reset); +} +  int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,  		void *ras_error_status,  		struct amdgpu_iv_entry *entry)  { -	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true); +	return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, +				AMDGPU_RAS_GPU_RESET_MODE1_RESET);  }  int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev) @@ -388,14 +402,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,  	return 0;  } -void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, +int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,  		uint64_t err_addr,  		uint64_t retired_page,  		uint32_t channel_index,  		uint32_t umc_inst)  { -	
struct eeprom_table_record *err_rec = -		&err_data->err_addr[err_data->err_addr_cnt]; +	struct eeprom_table_record *err_rec; + +	if (!err_data || +	    !err_data->err_addr || +	    (err_data->err_addr_cnt >= err_data->err_addr_len)) +		return -EINVAL; + +	err_rec = &err_data->err_addr[err_data->err_addr_cnt];  	err_rec->address = err_addr;  	/* page frame address is saved */ @@ -407,6 +427,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,  	err_rec->mcumc_id = umc_inst;  	err_data->err_addr_cnt++; + +	return 0;  }  int amdgpu_umc_loop_channels(struct amdgpu_device *adev, @@ -439,3 +461,76 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,  	return 0;  } + +int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, +				uint64_t status, uint64_t ipid, uint64_t addr) +{ +	if (adev->umc.ras->update_ecc_status) +		return adev->umc.ras->update_ecc_status(adev, +					status, ipid, addr); +	return 0; +} + +static int amdgpu_umc_uint64_cmp(const void *a, const void *b) +{ +	uint64_t *addr_a = (uint64_t *)a; +	uint64_t *addr_b = (uint64_t *)b; + +	if (*addr_a > *addr_b) +		return 1; +	else if (*addr_a < *addr_b) +		return -1; +	else +		return 0; +} + +/* Use string hash to avoid logging the same bad pages repeatedly */ +int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev, +		uint64_t *pfns, int len, uint64_t *val) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	char buf[MAX_UMC_HASH_STRING_SIZE] = {0}; +	int offset = 0, i = 0; +	uint64_t hash_val; + +	if (!pfns || !len) +		return -EINVAL; + +	sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL); + +	for (i = 0; i < len; i++) +		offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]); + +	hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key); + +	*val = hash_val; + +	return 0; +} + +int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, +		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	struct ras_ecc_log_info *ecc_log; +	int ret; + +	ecc_log = &con->umc_ecc_log; + +	mutex_lock(&ecc_log->lock); +	ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err); +	if (!ret) { +		struct ras_err_pages *err_pages = &ecc_err->err_pages; +		int i; + +		/* Reserve memory */ +		for (i = 0; i < err_pages->count; i++) +			amdgpu_ras_reserve_page(adev, err_pages->pfn[i]); + +		radix_tree_tag_set(ecc_tree, +			ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG); +	} +	mutex_unlock(&ecc_log->lock); + +	return ret; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index 26d2ae498daf..5f50c69c3cec 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -52,6 +52,8 @@  #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \  		LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst)) +/* Page retirement tag */ +#define UMC_ECC_NEW_DETECTED_TAG       0x1  typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,  			uint32_t umc_inst, uint32_t ch_inst, void *data); @@ -66,8 +68,8 @@ struct amdgpu_umc_ras {  					void *ras_error_status);  	bool (*check_ecc_err_status)(struct amdgpu_device *adev,  			enum amdgpu_mca_error_type type, void *ras_error_status); -	/* support different eeprom table version for different asic */ -	void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header *hdr); +	int (*update_ecc_status)(struct amdgpu_device *adev, +			uint64_t status, 
uint64_t ipid, uint64_t addr);  };  struct amdgpu_umc_funcs { @@ -103,11 +105,14 @@ struct amdgpu_umc {  int amdgpu_umc_ras_sw_init(struct amdgpu_device *adev);  int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block);  int amdgpu_umc_poison_handler(struct amdgpu_device *adev, -			enum amdgpu_ras_block block, bool reset); +			enum amdgpu_ras_block block, uint32_t reset); +int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev, +			enum amdgpu_ras_block block, uint16_t pasid, +			pasid_notify pasid_fn, void *data, uint32_t reset);  int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,  		struct amdgpu_irq_src *source,  		struct amdgpu_iv_entry *entry); -void amdgpu_umc_fill_error_record(struct ras_err_data *err_data, +int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,  		uint64_t err_addr,  		uint64_t retired_page,  		uint32_t channel_index, @@ -123,5 +128,15 @@ int amdgpu_umc_loop_channels(struct amdgpu_device *adev,  			umc_func func, void *data);  int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, -			bool reset, uint32_t timeout_ms); +			uint32_t reset, uint32_t timeout_ms); + +int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev, +				uint64_t status, uint64_t ipid, uint64_t addr); +int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev, +		uint64_t *pfns, int len, uint64_t *val); +int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, +		struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err); + +void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev, +			void *ras_error_status);  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c index ab820cf52668..e01c1c8e64c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.c @@ -189,10 +189,13 @@ static void setup_vpe_queue(struct amdgpu_device *adev,  	mqd->rptr_val = 0;  	mqd->unmapped = 1; +	if (adev->vpe.collaborate_mode) +		memcpy(++mqd, test->mqd_data_cpu_addr, sizeof(struct MQD_INFO)); +  	qinfo->mqd_addr = test->mqd_data_gpu_addr;  	qinfo->csa_addr = test->ctx_data_gpu_addr +  		offsetof(struct umsch_mm_test_ctx_data, vpe_ctx_csa); -	qinfo->doorbell_offset_0 = (adev->doorbell_index.vpe_ring + 1) << 1; +	qinfo->doorbell_offset_0 = 0;  	qinfo->doorbell_offset_1 = 0;  } @@ -287,7 +290,10 @@ static int submit_vpe_queue(struct amdgpu_device *adev, struct umsch_mm_test *te  	ring[5] = 0;  	mqd->wptr_val = (6 << 2); -	// WDOORBELL32(adev->umsch_mm.agdb_index[CONTEXT_PRIORITY_LEVEL_NORMAL], mqd->wptr_val); +	if (adev->vpe.collaborate_mode) +		(++mqd)->wptr_val = (6 << 2); + +	WDOORBELL32(adev->umsch_mm.agdb_index[CONTEXT_PRIORITY_LEVEL_NORMAL], mqd->wptr_val);  	for (i = 0; i < adev->usec_timeout; i++) {  		if (*fence == test_pattern) @@ -571,6 +577,7 @@ int amdgpu_umsch_mm_init_microcode(struct amdgpu_umsch_mm *umsch)  	switch (amdgpu_ip_version(adev, VCN_HWIP, 0)) {  	case IP_VERSION(4, 0, 5): +	case IP_VERSION(4, 0, 6):  		fw_name = "amdgpu/umsch_mm_4_0_0.bin";  		break;  	default: @@ -750,6 +757,7 @@ static int umsch_mm_early_init(void *handle)  	switch (amdgpu_ip_version(adev, VCN_HWIP, 0)) {  	case IP_VERSION(4, 0, 5): +	case IP_VERSION(4, 0, 6):  		umsch_mm_v4_0_set_funcs(&adev->umsch_mm);  		break;  	default: @@ -766,6 +774,9 @@ static int umsch_mm_late_init(void *handle)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)handle; +	if (amdgpu_in_reset(adev) || adev->in_s0ix || adev->in_suspend) +		return 0; +  	
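+	/* The self-test below submits real queues, so it is only run on a
+	 * fresh driver load and not on the reset/resume paths that return
+	 * early above.
+	 */
+	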
return umsch_mm_test(adev);  } @@ -867,6 +878,8 @@ static const struct amd_ip_funcs umsch_mm_v4_0_ip_funcs = {  	.hw_fini = umsch_mm_hw_fini,  	.suspend = umsch_mm_suspend,  	.resume = umsch_mm_resume, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version umsch_mm_v4_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.h index 8258a43a6236..5014b5af95fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umsch_mm.h @@ -33,13 +33,6 @@ enum UMSCH_SWIP_ENGINE_TYPE {  	UMSCH_SWIP_ENGINE_TYPE_MAX  }; -enum UMSCH_SWIP_AFFINITY_TYPE { -	UMSCH_SWIP_AFFINITY_TYPE_ANY = 0, -	UMSCH_SWIP_AFFINITY_TYPE_VCN0 = 1, -	UMSCH_SWIP_AFFINITY_TYPE_VCN1 = 2, -	UMSCH_SWIP_AFFINITY_TYPE_MAX -}; -  enum UMSCH_CONTEXT_PRIORITY_LEVEL {  	CONTEXT_PRIORITY_LEVEL_IDLE = 0,  	CONTEXT_PRIORITY_LEVEL_NORMAL = 1, @@ -51,13 +44,15 @@ enum UMSCH_CONTEXT_PRIORITY_LEVEL {  struct umsch_mm_set_resource_input {  	uint32_t vmid_mask_mm_vcn;  	uint32_t vmid_mask_mm_vpe; +	uint32_t collaboration_mask_vpe;  	uint32_t logging_vmid;  	uint32_t engine_mask;  	union {  		struct {  			uint32_t disable_reset : 1;  			uint32_t disable_umsch_mm_log : 1; -			uint32_t reserved : 30; +			uint32_t use_rs64mem_for_proc_ctx_csa : 1; +			uint32_t reserved : 29;  		};  		uint32_t uint32_all;  	}; @@ -78,15 +73,18 @@ struct umsch_mm_add_queue_input {  	uint32_t doorbell_offset_1;  	enum UMSCH_SWIP_ENGINE_TYPE engine_type;  	uint32_t affinity; -	enum UMSCH_SWIP_AFFINITY_TYPE affinity_type;  	uint64_t mqd_addr;  	uint64_t h_context;  	uint64_t h_queue;  	uint32_t vm_context_cntl; +	uint32_t process_csa_array_index; +	uint32_t context_csa_array_index; +  	struct {  		uint32_t is_context_suspended : 1; -		uint32_t reserved : 31; +		uint32_t collaboration_mode : 1; +		uint32_t reserved : 30;  	};  }; @@ -94,6 +92,7 @@ struct umsch_mm_remove_queue_input {  	uint32_t doorbell_offset_0;  	uint32_t doorbell_offset_1;  	uint64_t context_csa_addr; +	uint32_t context_csa_array_index;  };  struct MQD_INFO { @@ -103,6 +102,7 @@ struct MQD_INFO {  	uint32_t wptr_val;  	uint32_t rptr_val;  	uint32_t unmapped; +	uint32_t vmid;  };  struct amdgpu_umsch_mm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 59acf424a078..968ca2c84ef7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -743,7 +743,8 @@ int amdgpu_vce_ring_parse_cs(struct amdgpu_cs_parser *p,  	uint32_t created = 0;  	uint32_t allocated = 0;  	uint32_t tmp, handle = 0; -	uint32_t *size = &tmp; +	uint32_t dummy = 0xffffffff; +	uint32_t *size = &dummy;  	unsigned int idx;  	int i, r = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index 9c514a606a2f..677eb141554e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -93,7 +93,7 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work);  int amdgpu_vcn_early_init(struct amdgpu_device *adev)  { -	char ucode_prefix[30]; +	char ucode_prefix[25];  	char fw_name[40];  	int r, i; @@ -185,7 +185,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev)  	if (adev->firmware.load_type != AMDGPU_FW_LOAD_PSP)  		bo_size += AMDGPU_GPU_PAGE_ALIGN(le32_to_cpu(hdr->ucode_size_bytes) + 8); -	if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) { +	if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(5, 0, 
0)) { +		fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn5_fw_shared)); +		log_offset = offsetof(struct amdgpu_vcn5_fw_shared, fw_log); +	} else if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) {  		fw_shared_size = AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn4_fw_shared));  		log_offset = offsetof(struct amdgpu_vcn4_fw_shared, fw_log);  	} else { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h index a418393d89ec..9f06def236fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h @@ -454,6 +454,16 @@ struct amdgpu_vcn_rb_metadata {  	uint8_t pad[26];  }; +struct amdgpu_vcn5_fw_shared { +	uint32_t present_flag_0; +	uint8_t pad[12]; +	struct amdgpu_fw_shared_unified_queue_struct sq; +	uint8_t pad1[8]; +	struct amdgpu_fw_shared_fw_logging fw_log; +	struct amdgpu_fw_shared_rb_setup rb_setup; +	uint8_t pad2[4]; +}; +  #define VCN_BLOCK_ENCODE_DISABLE_MASK 0x80  #define VCN_BLOCK_DECODE_DISABLE_MASK 0x40  #define VCN_BLOCK_QUEUE_DISABLE_MASK 0xC0 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 7a4eae36778a..54ab51a4ada7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -32,6 +32,7 @@  #include "amdgpu.h"  #include "amdgpu_ras.h" +#include "amdgpu_reset.h"  #include "vi.h"  #include "soc15.h"  #include "nv.h" @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)  		return -EINVAL;  	if (pf2vf_info->size > 1024) { -		DRM_ERROR("invalid pf2vf message size\n"); +		dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);  		return -EINVAL;  	} @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)  			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,  			adev->virt.fw_reserve.checksum_key, checksum);  		if (checksum != checkval) { -			DRM_ERROR("invalid pf2vf message\n"); +			dev_err(adev->dev, +				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", +				checksum, checkval);  			return -EINVAL;  		} @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)  			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,  			0, checksum);  		if (checksum != checkval) { -			DRM_ERROR("invalid pf2vf message\n"); +			dev_err(adev->dev, +				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", +				checksum, checkval);  			return -EINVAL;  		} @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)  			((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;  		break;  	default: -		DRM_ERROR("invalid pf2vf version\n"); +		dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);  		return -EINVAL;  	} @@ -571,6 +576,11 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)  	vf2pf_info->decode_usage = 0;  	vf2pf_info->dummy_page_addr = (uint64_t)adev->dummy_page_addr; +	vf2pf_info->mes_info_addr = (uint64_t)adev->mes.resource_1_gpu_addr; + +	if (adev->mes.resource_1) { +		vf2pf_info->mes_info_size = adev->mes.resource_1->tbo.base.size; +	}  	vf2pf_info->checksum =  		amd_sriov_msg_checksum(  		vf2pf_info, vf2pf_info->header.size, 0, 0); @@ -584,8 +594,22 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)  	int ret;  	ret = amdgpu_virt_read_pf2vf_data(adev); -	if (ret) +	if (ret) { +		adev->virt.vf2pf_update_retry_cnt++; +		if 
((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && +		    amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { +			amdgpu_ras_set_fed(adev, true); +			if (amdgpu_reset_domain_schedule(adev->reset_domain, +							  &adev->virt.flr_work)) +				return; +			else +				dev_err(adev->dev, "Failed to queue work! at %s", __func__); +		} +  		goto out; +	} + +	adev->virt.vf2pf_update_retry_cnt = 0;  	amdgpu_virt_write_vf2pf_data(adev);  out: @@ -606,6 +630,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)  	adev->virt.fw_reserve.p_pf2vf = NULL;  	adev->virt.fw_reserve.p_vf2pf = NULL;  	adev->virt.vf2pf_update_interval_ms = 0; +	adev->virt.vf2pf_update_retry_cnt = 0;  	if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {  		DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!"); @@ -705,12 +730,6 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev)  			adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE;  	} -	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) -		/* VF MMIO access (except mailbox range) from CPU -		 * will be blocked during sriov runtime -		 */ -		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT; -  	/* we have the ability to check now */  	if (amdgpu_sriov_vf(adev)) {  		switch (adev->asic_type) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index 3f59b7b5523f..642f1fd287d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -52,6 +52,8 @@  /* tonga/fiji use this offset */  #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503 +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5 +  enum amdgpu_sriov_vf_mode {  	SRIOV_VF_MODE_BARE_METAL = 0,  	SRIOV_VF_MODE_ONE_VF, @@ -130,6 +132,8 @@ enum AMDGIM_FEATURE_FLAG {  	AMDGIM_FEATURE_AV1_SUPPORT = (1 << 6),  	/* VCN RB decouple */  	AMDGIM_FEATURE_VCN_RB_DECOUPLE = (1 << 7), +	/* MES info */ +	AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8),  };  enum AMDGIM_REG_ACCESS_FLAG { @@ -257,6 +261,7 @@ struct amdgpu_virt {  	/* vf2pf message */  	struct delayed_work vf2pf_work;  	uint32_t vf2pf_update_interval_ms; +	int vf2pf_update_retry_cnt;  	/* multimedia bandwidth config */  	bool     is_mm_bw_enabled; @@ -332,6 +337,8 @@ static inline bool is_virtual_machine(void)  	((adev)->virt.gim_feature & AMDGIM_FEATURE_AV1_SUPPORT)  #define amdgpu_sriov_is_vcn_rb_decouple(adev) \  	((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE) +#define amdgpu_sriov_is_mes_info_enable(adev) \ +	((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)  bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);  void amdgpu_virt_init_setting(struct amdgpu_device *adev);  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c index 8baa2e0935cc..e30eecd02ae1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c @@ -658,6 +658,8 @@ static const struct amd_ip_funcs amdgpu_vkms_ip_funcs = {  	.soft_reset = amdgpu_vkms_soft_reset,  	.set_clockgating_state = amdgpu_vkms_set_clockgating_state,  	.set_powergating_state = amdgpu_vkms_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version amdgpu_vkms_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 4299ce386322..4e2391c83d7c 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -886,6 +886,44 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,  }  /** + * amdgpu_vm_tlb_flush - prepare TLB flush + * + * @params: parameters for update + * @fence: input fence to sync TLB flush with + * @tlb_cb: the callback structure + * + * Increments the tlb sequence to make sure that future CS execute a VM flush. + */ +static void +amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params, +		    struct dma_fence **fence, +		    struct amdgpu_vm_tlb_seq_struct *tlb_cb) +{ +	struct amdgpu_vm *vm = params->vm; + +	if (!fence || !*fence) +		return; + +	tlb_cb->vm = vm; +	if (!dma_fence_add_callback(*fence, &tlb_cb->cb, +				    amdgpu_vm_tlb_seq_cb)) { +		dma_fence_put(vm->last_tlb_flush); +		vm->last_tlb_flush = dma_fence_get(*fence); +	} else { +		amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); +	} + +	/* Prepare a TLB flush fence to be attached to PTs */ +	if (!params->unlocked && vm->is_compute_context) { +		amdgpu_vm_tlb_fence_create(params->adev, vm, fence); + +		/* Makes sure no PD/PT is freed before the flush */ +		dma_resv_add_fence(vm->root.bo->tbo.base.resv, *fence, +				   DMA_RESV_USAGE_BOOKKEEP); +	} +} + +/**   * amdgpu_vm_update_range - update a range in the vm page table   *   * @adev: amdgpu_device pointer to use for commands @@ -916,8 +954,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,  			   struct ttm_resource *res, dma_addr_t *pages_addr,  			   struct dma_fence **fence)  { -	struct amdgpu_vm_update_params params;  	struct amdgpu_vm_tlb_seq_struct *tlb_cb; +	struct amdgpu_vm_tlb_seq_struct *tlb_cb; +	struct amdgpu_vm_update_params params;  	struct amdgpu_res_cursor cursor;  	enum amdgpu_sync_mode sync_mode;  	int r, idx; @@ -927,8 +965,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,  	tlb_cb = kmalloc(sizeof(*tlb_cb), GFP_KERNEL);  	if (!tlb_cb) { -		r = -ENOMEM; -		goto error_unlock; +		drm_dev_exit(idx); +		return -ENOMEM;  	}  	/* Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache, @@ -948,7 +986,9 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,  	params.immediate = immediate;  	params.pages_addr = pages_addr;  	params.unlocked = unlocked; +	params.needs_flush = flush_tlb;  	params.allow_override = allow_override; +	INIT_LIST_HEAD(&params.tlb_flush_waitlist);  	/* Implicitly sync to command submissions in the same VM before  	 * unmapping. Sync to moving fences before mapping. 
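The new amdgpu_vm_tlb_flush() helper above centralizes what used to be open-coded at commit time: the dma-fence callback increments vm->tlb_seq once the page-table update fence signals, and for locked updates on compute VMs a dedicated TLB-flush fence is created and attached to the root PD reservation so that no PD/PT can be freed before the flush has actually run. A minimal sketch of the consumer side (the helper name example_vmid_needs_flush is hypothetical; the tlb_seq comparison is the mechanism amdgpu's VMID management uses to decide when a flush is due):

	/* Sketch: a job that last flushed at an older sequence number than the
	 * VM's current tlb_seq must emit a VM flush before it runs.
	 */
	static bool example_vmid_needs_flush(struct amdgpu_vm *vm, u64 last_flushed_seq)
	{
		return last_flushed_seq < atomic64_read(&vm->tlb_seq);
	}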
@@ -1031,24 +1071,18 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,  	}  	r = vm->update_funcs->commit(&params, fence); +	if (r) +		goto error_free; -	if (flush_tlb || params.table_freed) { -		tlb_cb->vm = vm; -		if (fence && *fence && -		    !dma_fence_add_callback(*fence, &tlb_cb->cb, -					   amdgpu_vm_tlb_seq_cb)) { -			dma_fence_put(vm->last_tlb_flush); -			vm->last_tlb_flush = dma_fence_get(*fence); -		} else { -			amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb); -		} +	if (params.needs_flush) { +		amdgpu_vm_tlb_flush(&params, fence, tlb_cb);  		tlb_cb = NULL;  	} +	amdgpu_vm_pt_free_list(adev, &params); +  error_free:  	kfree(tlb_cb); - -error_unlock:  	amdgpu_vm_eviction_unlock(vm);  	drm_dev_exit(idx);  	return r; @@ -1613,6 +1647,37 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev,  	trace_amdgpu_vm_bo_map(bo_va, mapping);  } +/* Validate operation parameters to prevent potential abuse */ +static int amdgpu_vm_verify_parameters(struct amdgpu_device *adev, +					  struct amdgpu_bo *bo, +					  uint64_t saddr, +					  uint64_t offset, +					  uint64_t size) +{ +	uint64_t tmp, lpfn; + +	if (saddr & AMDGPU_GPU_PAGE_MASK +	    || offset & AMDGPU_GPU_PAGE_MASK +	    || size & AMDGPU_GPU_PAGE_MASK) +		return -EINVAL; + +	if (check_add_overflow(saddr, size, &tmp) +	    || check_add_overflow(offset, size, &tmp) +	    || size == 0 /* which also leads to end < begin */) +		return -EINVAL; + +	/* make sure object fit at this offset */ +	if (bo && offset + size > amdgpu_bo_size(bo)) +		return -EINVAL; + +	/* Ensure last pfn not exceed max_pfn */ +	lpfn = (saddr + size - 1) >> AMDGPU_GPU_PAGE_SHIFT; +	if (lpfn >= adev->vm_manager.max_pfn) +		return -EINVAL; + +	return 0; +} +  /**   * amdgpu_vm_bo_map - map bo inside a vm   * @@ -1639,21 +1704,14 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev,  	struct amdgpu_bo *bo = bo_va->base.bo;  	struct amdgpu_vm *vm = bo_va->base.vm;  	uint64_t eaddr; +	int r; -	/* validate the parameters */ -	if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) -		return -EINVAL; -	if (saddr + size <= saddr || offset + size <= offset) -		return -EINVAL; - -	/* make sure object fit at this offset */ -	eaddr = saddr + size - 1; -	if ((bo && offset + size > amdgpu_bo_size(bo)) || -	    (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) -		return -EINVAL; +	r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); +	if (r) +		return r;  	saddr /= AMDGPU_GPU_PAGE_SIZE; -	eaddr /= AMDGPU_GPU_PAGE_SIZE; +	eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE;  	tmp = amdgpu_vm_it_iter_first(&vm->va, saddr, eaddr);  	if (tmp) { @@ -1706,17 +1764,9 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,  	uint64_t eaddr;  	int r; -	/* validate the parameters */ -	if (saddr & ~PAGE_MASK || offset & ~PAGE_MASK || size & ~PAGE_MASK) -		return -EINVAL; -	if (saddr + size <= saddr || offset + size <= offset) -		return -EINVAL; - -	/* make sure object fit at this offset */ -	eaddr = saddr + size - 1; -	if ((bo && offset + size > amdgpu_bo_size(bo)) || -	    (eaddr >= adev->vm_manager.max_pfn << AMDGPU_GPU_PAGE_SHIFT)) -		return -EINVAL; +	r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size); +	if (r) +		return r;  	/* Allocate all the needed memory */  	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); @@ -1730,7 +1780,7 @@ int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,  	}  	saddr /= AMDGPU_GPU_PAGE_SIZE; -	eaddr /= AMDGPU_GPU_PAGE_SIZE; +	eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE;  	
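	/*
	 * Worked example for the inclusive page math above (annotation,
	 * assuming the usual 4 KiB AMDGPU_GPU_PAGE_SIZE): saddr = 0x1000 and
	 * size = 0x2000 give saddr = page 1 and
	 * eaddr = 1 + (0x2000 - 1) / 0x1000 = page 2, i.e. the closed
	 * interval [1, 2] that the VA interval tree expects.  Converting
	 * saddr to a page index before adding also sidesteps the byte-level
	 * overflow the removed "eaddr = saddr + size - 1" could hit near the
	 * top of the address space, complementing the check_add_overflow()
	 * tests in amdgpu_vm_verify_parameters() above.
	 */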
mapping->start = saddr;  	mapping->last = eaddr; @@ -1817,10 +1867,14 @@ int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,  	struct amdgpu_bo_va_mapping *before, *after, *tmp, *next;  	LIST_HEAD(removed);  	uint64_t eaddr; +	int r; + +	r = amdgpu_vm_verify_parameters(adev, NULL, saddr, 0, size); +	if (r) +		return r; -	eaddr = saddr + size - 1;  	saddr /= AMDGPU_GPU_PAGE_SIZE; -	eaddr /= AMDGPU_GPU_PAGE_SIZE; +	eaddr = saddr + (size - 1) / AMDGPU_GPU_PAGE_SIZE;  	/* Allocate all the needed memory */  	before = kzalloc(sizeof(*before), GFP_KERNEL); @@ -2391,6 +2445,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,  	mutex_init(&vm->eviction_lock);  	vm->evicting = false; +	vm->tlb_fence_context = dma_fence_context_alloc(1);  	r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,  				false, &root, xcp_id); @@ -2924,6 +2979,14 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,  	if (vm && status) {  		vm->fault_info.addr = addr;  		vm->fault_info.status = status; +		/* +		 * Update the fault information globally for later usage +		 * when vm could be stale or freed. +		 */ +		adev->vm_manager.fault_info.addr = addr; +		adev->vm_manager.fault_info.vmhub = vmhub; +		adev->vm_manager.fault_info.status = status; +  		if (AMDGPU_IS_GFXHUB(vmhub)) {  			vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;  			vm->fault_info.vmhub |= diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 047ec1930d12..54d7da396de0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -257,15 +257,20 @@ struct amdgpu_vm_update_params {  	unsigned int num_dw_left;  	/** -	 * @table_freed: return true if page table is freed when updating +	 * @needs_flush: true whenever we need to invalidate the TLB  	 */ -	bool table_freed; +	bool needs_flush;  	/**  	 * @allow_override: true for memory that is not uncached: allows MTYPE  	 * to be overridden for NUMA local memory.  	 
*/  	bool allow_override; + +	/** +	 * @tlb_flush_waitlist: temporary storage for BOs until tlb_flush +	 */ +	struct list_head tlb_flush_waitlist;  };  struct amdgpu_vm_update_funcs { @@ -342,6 +347,7 @@ struct amdgpu_vm {  	atomic64_t		tlb_seq;  	struct dma_fence	*last_tlb_flush;  	atomic64_t		kfd_last_flushed_seq; +	uint64_t		tlb_fence_context;  	/* How many times we had to re-generate the page tables */  	uint64_t		generation; @@ -422,6 +428,8 @@ struct amdgpu_vm_manager {  	 * look up VM of a page fault  	 */  	struct xarray				pasids; +	/* Global registration of recent page fault information */ +	struct amdgpu_vm_fault_info	fault_info;  };  struct amdgpu_bo_va_mapping; @@ -544,6 +552,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,  			  uint64_t start, uint64_t end,  			  uint64_t dst, uint64_t flags);  void amdgpu_vm_pt_free_work(struct work_struct *work); +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev, +			    struct amdgpu_vm_update_params *params);  #if defined(CONFIG_DEBUG_FS)  void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m); @@ -609,5 +619,8 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,  				  uint64_t addr,  				  uint32_t status,  				  unsigned int vmhub); +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, +				 struct amdgpu_vm *vm, +				 struct dma_fence **fence);  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c index 6e31621452de..3895bd7d176a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c @@ -108,7 +108,9 @@ static int amdgpu_vm_cpu_update(struct amdgpu_vm_update_params *p,  static int amdgpu_vm_cpu_commit(struct amdgpu_vm_update_params *p,  				struct dma_fence **fence)  { -	/* Flush HDP */ +	if (p->needs_flush) +		atomic64_inc(&p->vm->tlb_seq); +  	mb();  	amdgpu_device_flush_hdp(p->adev, NULL);  	return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 124389a6bf48..7fdd306a48a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -622,40 +622,58 @@ void amdgpu_vm_pt_free_work(struct work_struct *work)  }  /** - * amdgpu_vm_pt_free_dfs - free PD/PT levels + * amdgpu_vm_pt_free_list - free PD/PT levels   *   * @adev: amdgpu device structure - * @vm: amdgpu vm structure - * @start: optional cursor where to start freeing PDs/PTs - * @unlocked: vm resv unlock status + * @params: see amdgpu_vm_update_params definition   * - * Free the page directory or page table level and all sub levels. 
+ * Free the page directory objects saved in the flush list   */ -static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev, -				  struct amdgpu_vm *vm, -				  struct amdgpu_vm_pt_cursor *start, -				  bool unlocked) +void amdgpu_vm_pt_free_list(struct amdgpu_device *adev, +			    struct amdgpu_vm_update_params *params)  { -	struct amdgpu_vm_pt_cursor cursor; -	struct amdgpu_vm_bo_base *entry; +	struct amdgpu_vm_bo_base *entry, *next; +	struct amdgpu_vm *vm = params->vm; +	bool unlocked = params->unlocked; + +	if (list_empty(&params->tlb_flush_waitlist)) +		return;  	if (unlocked) {  		spin_lock(&vm->status_lock); -		for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) -			list_move(&entry->vm_status, &vm->pt_freed); - -		if (start) -			list_move(&start->entry->vm_status, &vm->pt_freed); +		list_splice_init(&params->tlb_flush_waitlist, &vm->pt_freed);  		spin_unlock(&vm->status_lock);  		schedule_work(&vm->pt_free_work);  		return;  	} -	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, start, cursor, entry) +	list_for_each_entry_safe(entry, next, &params->tlb_flush_waitlist, vm_status)  		amdgpu_vm_pt_free(entry); +} -	if (start) -		amdgpu_vm_pt_free(start->entry); +/** + * amdgpu_vm_pt_add_list - add PD/PT level to the flush list + * + * @params: parameters for the update + * @cursor: first PT entry to start DF search from, non NULL + * + * This list will be freed after TLB flush. + */ +static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params, +				  struct amdgpu_vm_pt_cursor *cursor) +{ +	struct amdgpu_vm_pt_cursor seek; +	struct amdgpu_vm_bo_base *entry; + +	spin_lock(&params->vm->status_lock); +	for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) { +		if (entry && entry->bo) +			list_move(&entry->vm_status, &params->tlb_flush_waitlist); +	} + +	/* enter start node now */ +	list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist); +	spin_unlock(&params->vm->status_lock);  }  /** @@ -667,7 +685,13 @@ static void amdgpu_vm_pt_free_dfs(struct amdgpu_device *adev,   */  void amdgpu_vm_pt_free_root(struct amdgpu_device *adev, struct amdgpu_vm *vm)  { -	amdgpu_vm_pt_free_dfs(adev, vm, NULL, false); +	struct amdgpu_vm_pt_cursor cursor; +	struct amdgpu_vm_bo_base *entry; + +	for_each_amdgpu_vm_pt_dfs_safe(adev, vm, NULL, cursor, entry) { +		if (entry) +			amdgpu_vm_pt_free(entry); +	}  }  /** @@ -972,10 +996,8 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params,  			while (cursor.pfn < frag_start) {  				/* Make sure previous mapping is freed */  				if (cursor.entry->bo) { -					params->table_freed = true; -					amdgpu_vm_pt_free_dfs(adev, params->vm, -							      &cursor, -							      params->unlocked); +					params->needs_flush = true; +					amdgpu_vm_pt_add_list(params, &cursor);  				}  				amdgpu_vm_pt_next(adev, &cursor);  			} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 349416e176a1..66e8a016126b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -126,6 +126,10 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,  	WARN_ON(ib->length_dw == 0);  	amdgpu_ring_pad_ib(ring, ib); + +	if (p->needs_flush) +		atomic64_inc(&p->vm->tlb_seq); +  	WARN_ON(ib->length_dw > p->num_dw_left);  	f = amdgpu_job_submit(p->job); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c new file mode 100644 index 000000000000..51cddfa3f1e8 --- /dev/null +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_tlb_fence.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/dma-fence.h> +#include <linux/workqueue.h> + +#include "amdgpu.h" +#include "amdgpu_vm.h" +#include "amdgpu_gmc.h" + +struct amdgpu_tlb_fence { +	struct dma_fence	base; +	struct amdgpu_device	*adev; +	struct dma_fence	*dependency; +	struct work_struct	work; +	spinlock_t		lock; +	uint16_t		pasid; + +}; + +static const char *amdgpu_tlb_fence_get_driver_name(struct dma_fence *fence) +{ +	return "amdgpu tlb fence"; +} + +static const char *amdgpu_tlb_fence_get_timeline_name(struct dma_fence *f) +{ +	return "amdgpu tlb timeline"; +} + +static void amdgpu_tlb_fence_work(struct work_struct *work) +{ +	struct amdgpu_tlb_fence *f = container_of(work, typeof(*f), work); +	int r; + +	if (f->dependency) { +		dma_fence_wait(f->dependency, false); +		dma_fence_put(f->dependency); +		f->dependency = NULL; +	} + +	r = amdgpu_gmc_flush_gpu_tlb_pasid(f->adev, f->pasid, 2, true, 0); +	if (r) { +		dev_err(f->adev->dev, "TLB flush failed for PASID %d.\n", +			f->pasid); +		dma_fence_set_error(&f->base, r); +	} + +	dma_fence_signal(&f->base); +	dma_fence_put(&f->base); +} + +static const struct dma_fence_ops amdgpu_tlb_fence_ops = { +	.use_64bit_seqno = true, +	.get_driver_name = amdgpu_tlb_fence_get_driver_name, +	.get_timeline_name = amdgpu_tlb_fence_get_timeline_name +}; + +void amdgpu_vm_tlb_fence_create(struct amdgpu_device *adev, struct amdgpu_vm *vm, +				struct dma_fence **fence) +{ +	struct amdgpu_tlb_fence *f; + +	f = kmalloc(sizeof(*f), GFP_KERNEL); +	if (!f) { +		/* +		 * We can't fail since the PDEs and PTEs are already updated, so +		 * just block for the dependency and execute the TLB flush +		 */ +		if (*fence) +			dma_fence_wait(*fence, false); + +		amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, 2, true, 0); +		*fence = dma_fence_get_stub(); +		return; +	} + +	f->adev = adev; +	f->dependency = *fence; +	f->pasid = vm->pasid; +	INIT_WORK(&f->work, amdgpu_tlb_fence_work); +	spin_lock_init(&f->lock); + +	dma_fence_init(&f->base, &amdgpu_tlb_fence_ops, &f->lock, +		       vm->tlb_fence_context, atomic64_read(&vm->tlb_seq)); + +	/* TODO: We probably need a separate wq here */ +	dma_fence_get(&f->base); +	schedule_work(&f->work); + +	*fence = &f->base; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c index 7a65a2b128ec..c23d97d34b7e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c @@ -205,7 +205,7 @@ disable_dpm:  	dpm_ctl &= 0xfffffffe; /* Disable DPM */  	WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable), dpm_ctl);  	dev_dbg(adev->dev, "%s: disable vpe dpm\n", __func__); -	return 0; +	return -EINVAL;  }  int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev) @@ -396,6 +396,12 @@ static int vpe_hw_init(void *handle)  	struct amdgpu_vpe *vpe = &adev->vpe;  	int ret; +	/* Power on VPE */ +	ret = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, +						     AMD_PG_STATE_UNGATE); +	if (ret) +		return ret; +  	ret = vpe_load_microcode(vpe);  	if (ret)  		return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 8db880244324..6c30eceec896 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -450,6 +450,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  {  	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);  	struct amdgpu_device *adev = to_amdgpu_device(mgr); +	struct amdgpu_bo *bo = ttm_to_amdgpu_bo(tbo);  	u64 vis_usage = 0, max_bytes, min_block_size;  	struct amdgpu_vram_mgr_resource *vres;  	u64 size, remaining_size, lpfn, fpfn; @@ -468,7 +469,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  	if (tbo->type != ttm_bo_type_kernel)  		max_bytes -= AMDGPU_VM_RESERVED_VRAM; -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) { +	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {  		pages_per_block = ~0ul;  	} else {  #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -477,7 +478,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  		/* default to 2MB */  		pages_per_block = 2UL << (20UL - PAGE_SHIFT);  #endif -		pages_per_block = max_t(uint32_t, pages_per_block, +		pages_per_block = max_t(u32, pages_per_block,  					tbo->page_alignment);  	} @@ -498,9 +499,12 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  	if (place->flags & TTM_PL_FLAG_TOPDOWN)  		vres->flags |= DRM_BUDDY_TOPDOWN_ALLOCATION; -	if (place->flags & TTM_PL_FLAG_CONTIGUOUS) +	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS)  		vres->flags |= DRM_BUDDY_CONTIGUOUS_ALLOCATION; +	if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED) +		vres->flags |= DRM_BUDDY_CLEAR_ALLOCATION; +  	if (fpfn || lpfn != mgr->mm.size)  		/* Allocate blocks in desired range */  		vres->flags |= DRM_BUDDY_RANGE_ALLOCATION; @@ -514,21 +518,31 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  		else  			min_block_size = mgr->default_page_size; -		BUG_ON(min_block_size < mm->chunk_size); -  		/* Limit maximum size to 2GiB due to SG table limitations */  		size = min(remaining_size, 2ULL << 30);  		if ((size >= (u64)pages_per_block << PAGE_SHIFT) && -				!(size & (((u64)pages_per_block << PAGE_SHIFT) - 1))) +		    !(size & (((u64)pages_per_block << PAGE_SHIFT) - 1)))  			min_block_size = (u64)pages_per_block << PAGE_SHIFT; +		BUG_ON(min_block_size < mm->chunk_size); +  		r = drm_buddy_alloc_blocks(mm, fpfn,  					   lpfn,  					   size,  					   min_block_size,  					   &vres->blocks,  					   vres->flags); + +		if (unlikely(r == -ENOSPC) && pages_per_block == ~0ul && +		    !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) { +			vres->flags &= ~DRM_BUDDY_CONTIGUOUS_ALLOCATION; +			pages_per_block = max_t(u32, 2UL << (20UL - PAGE_SHIFT), +						
tbo->page_alignment); + +			continue; +		} +  		if (unlikely(r))  			goto error_free_blocks; @@ -571,7 +585,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,  	return 0;  error_free_blocks: -	drm_buddy_free_list(mm, &vres->blocks); +	drm_buddy_free_list(mm, &vres->blocks, 0);  	mutex_unlock(&mgr->lock);  error_fini:  	ttm_resource_fini(man, &vres->base); @@ -604,7 +618,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,  	amdgpu_vram_mgr_do_reserve(man); -	drm_buddy_free_list(mm, &vres->blocks); +	drm_buddy_free_list(mm, &vres->blocks, vres->flags);  	mutex_unlock(&mgr->lock);  	atomic64_sub(vis_usage, &mgr->vis_usage); @@ -912,7 +926,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)  		kfree(rsv);  	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) { -		drm_buddy_free_list(&mgr->mm, &rsv->allocated); +		drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);  		kfree(rsv);  	}  	if (!adev->gmc.is_app_apu) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 0e04e42cf809..b256cbc2bc27 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -53,10 +53,20 @@ static inline u64 amdgpu_vram_mgr_block_size(struct drm_buddy_block *block)  	return (u64)PAGE_SIZE << drm_buddy_block_order(block);  } +static inline bool amdgpu_vram_mgr_is_cleared(struct drm_buddy_block *block) +{ +	return drm_buddy_block_is_clear(block); +} +  static inline struct amdgpu_vram_mgr_resource *  to_amdgpu_vram_mgr_resource(struct ttm_resource *res)  {  	return container_of(res, struct amdgpu_vram_mgr_resource, base);  } +static inline void amdgpu_vram_mgr_set_cleared(struct ttm_resource *res) +{ +	to_amdgpu_vram_mgr_resource(res)->flags |= DRM_BUDDY_CLEARED; +} +  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 20d51f6c9bb8..dd2ec48cf5c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1035,15 +1035,16 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)  	return 0;  } -static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, -						struct aca_bank_report *report, void *data) +static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, +				       enum aca_smu_type type, void *data)  {  	struct amdgpu_device *adev = handle->adev; +	struct aca_bank_info info;  	const char *error_str; -	u64 status; +	u64 status, count;  	int ret, ext_error_code; -	ret = aca_bank_info_decode(bank, &report->info); +	ret = aca_bank_info_decode(bank, &info);  	if (ret)  		return ret; @@ -1055,15 +1056,28 @@ static int xgmi_v6_4_0_aca_bank_generate_report(struct aca_handle *handle, struc  	if (error_str)  		dev_info(adev->dev, "%s detected\n", error_str); -	if ((type == ACA_ERROR_TYPE_UE && ext_error_code == 0) || -	    (type == ACA_ERROR_TYPE_CE && ext_error_code == 6)) -		report->count[type] = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]); +	count = ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]); -	return 0; +	switch (type) { +	case ACA_SMU_TYPE_UE: +		if (ext_error_code != 0 && ext_error_code != 9) +			count = 0ULL; + +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, count); +		break; +	case ACA_SMU_TYPE_CE: +		count = ext_error_code == 6 ? 
count : 0ULL; +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count); +		break; +	default: +		return -EINVAL; +	} + +	return ret;  }  static const struct aca_bank_ops xgmi_v6_4_0_aca_bank_ops = { -	.aca_bank_generate_report = xgmi_v6_4_0_aca_bank_generate_report, +	.aca_bank_parser = xgmi_v6_4_0_aca_bank_parser,  };  static const struct aca_info xgmi_v6_4_0_aca_info = { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 1592c63b3099..a3bfc16de6d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -44,6 +44,7 @@ struct amdgpu_hive_info {  	struct amdgpu_reset_domain *reset_domain;  	atomic_t ras_recovery; +	struct ras_event_manager event_mgr;  };  struct amdgpu_pcs_ras_field { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h index 51a14f6d93bd..fb2b394bb9c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h @@ -94,7 +94,8 @@ union amd_sriov_msg_feature_flags {  		uint32_t reg_indirect_acc  : 1;  		uint32_t av1_support       : 1;  		uint32_t vcn_rb_decouple   : 1; -		uint32_t reserved          : 24; +		uint32_t mes_info_enable   : 1; +		uint32_t reserved          : 23;  	} flags;  	uint32_t all;  }; @@ -157,7 +158,7 @@ struct amd_sriov_msg_pf2vf_info_header {  	uint32_t reserved[2];  }; -#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (48) +#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49)  struct amd_sriov_msg_pf2vf_info {  	/* header contains size and version */  	struct amd_sriov_msg_pf2vf_info_header header; @@ -208,6 +209,8 @@ struct amd_sriov_msg_pf2vf_info {  	struct amd_sriov_msg_uuid_info uuid_info;  	/* PCIE atomic ops support flag */  	uint32_t pcie_atomic_ops_support_flags; +	/* Portion of GPU memory occupied by VF.  
MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */ +	uint32_t gpu_capacity;  	/* reserved */  	uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE];  }; @@ -221,7 +224,7 @@ struct amd_sriov_msg_vf2pf_info_header {  	uint32_t reserved[2];  }; -#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (70) +#define AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE (73)  struct amd_sriov_msg_vf2pf_info {  	/* header contains size and version */  	struct amd_sriov_msg_vf2pf_info_header header; @@ -265,7 +268,9 @@ struct amd_sriov_msg_vf2pf_info {  		uint32_t version;  	} ucode_info[AMD_SRIOV_MSG_RESERVE_UCODE];  	uint64_t dummy_page_addr; - +	/* FB allocated for guest MES to record UQ info */ +	uint64_t mes_info_addr; +	uint32_t mes_info_size;  	/* reserved */  	uint32_t reserved[256 - AMD_SRIOV_MSG_VF2PF_INFO_FILLED_SIZE];  }; diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c index d6f808acfb17..414ea3f560a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c +++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c @@ -62,6 +62,11 @@ void aqua_vanjaram_doorbell_index_init(struct amdgpu_device *adev)  	adev->doorbell_index.max_assignment = AMDGPU_DOORBELL_LAYOUT1_MAX_ASSIGNMENT << 1;  } +static bool aqua_vanjaram_xcp_vcn_shared(struct amdgpu_device *adev) +{ +	return (adev->xcp_mgr->num_xcps > adev->vcn.num_vcn_inst); +} +  static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,  			     uint32_t inst_idx, struct amdgpu_ring *ring)  { @@ -87,7 +92,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device *adev,  	case AMDGPU_RING_TYPE_VCN_ENC:  	case AMDGPU_RING_TYPE_VCN_JPEG:  		ip_blk = AMDGPU_XCP_VCN; -		if (adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) +		if (aqua_vanjaram_xcp_vcn_shared(adev))  			inst_mask = 1 << (inst_idx * 2);  		break;  	default: @@ -140,10 +145,12 @@ static int aqua_vanjaram_xcp_sched_list_update(  		aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id); -		/* VCN is shared by two partitions under CPX MODE */ +		/* VCN may be shared by two partitions under CPX MODE in certain +		 * configs. 
+		 */  		if ((ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC || -			ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && -			adev->xcp_mgr->mode == AMDGPU_CPX_PARTITION_MODE) +		     ring->funcs->type == AMDGPU_RING_TYPE_VCN_JPEG) && +		    aqua_vanjaram_xcp_vcn_shared(adev))  			aqua_vanjaram_xcp_gpu_sched_update(adev, ring, ring->xcp_id + 1);  	} @@ -623,7 +630,7 @@ static int aqua_vanjaram_xcp_mgr_init(struct amdgpu_device *adev)  int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)  { -	u32 mask, inst_mask = adev->sdma.sdma_mask; +	u32 mask, avail_inst, inst_mask = adev->sdma.sdma_mask;  	int ret, i;  	/* generally 1 AID supports 4 instances */ @@ -635,7 +642,9 @@ int aqua_vanjaram_init_soc_config(struct amdgpu_device *adev)  	for (mask = (1 << adev->sdma.num_inst_per_aid) - 1; inst_mask;  	     inst_mask >>= adev->sdma.num_inst_per_aid, ++i) { -		if ((inst_mask & mask) == mask) +		avail_inst = inst_mask & mask; +		if (avail_inst == mask || avail_inst == 0x3 || +		    avail_inst == 0xc)  			adev->aid_mask |= (1 << i);  	} diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c index 72362df352f6..d552e013354c 100644 --- a/drivers/gpu/drm/amd/amdgpu/atom.c +++ b/drivers/gpu/drm/amd/amdgpu/atom.c @@ -1243,6 +1243,7 @@ static int amdgpu_atom_execute_table_locked(struct atom_context *ctx, int index,  	ectx.ps_size = params_size;  	ectx.abort = false;  	ectx.last_jump = 0; +	ectx.last_jump_jiffies = 0;  	if (ws) {  		ectx.ws = kcalloc(4, ws, GFP_KERNEL);  		ectx.ws_size = ws; diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c index a3a643254d7a..cf1d5d462b67 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik.c +++ b/drivers/gpu/drm/amd/amdgpu/cik.c @@ -1375,14 +1375,14 @@ static int cik_asic_pci_config_reset(struct amdgpu_device *adev)  	return r;  } -static bool cik_asic_supports_baco(struct amdgpu_device *adev) +static int cik_asic_supports_baco(struct amdgpu_device *adev)  {  	switch (adev->asic_type) {  	case CHIP_BONAIRE:  	case CHIP_HAWAII:  		return amdgpu_dpm_is_baco_supported(adev);  	default: -		return false; +		return 0;  	}  } @@ -2210,6 +2210,8 @@ static const struct amd_ip_funcs cik_common_ip_funcs = {  	.soft_reset = cik_common_soft_reset,  	.set_clockgating_state = cik_common_set_clockgating_state,  	.set_powergating_state = cik_common_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ip_block_version cik_common_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c b/drivers/gpu/drm/amd/amdgpu/cik_ih.c index f24e34dc33d1..576baa9dbb0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c @@ -435,6 +435,8 @@ static const struct amd_ip_funcs cik_ih_ip_funcs = {  	.soft_reset = cik_ih_soft_reset,  	.set_clockgating_state = cik_ih_set_clockgating_state,  	.set_powergating_state = cik_ih_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs cik_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c index a3fccc4c1f43..6948ebda0fa2 100644 --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c @@ -1228,6 +1228,8 @@ static const struct amd_ip_funcs cik_sdma_ip_funcs = {  	.soft_reset = cik_sdma_soft_reset,  	.set_clockgating_state = cik_sdma_set_clockgating_state,  	.set_powergating_state = cik_sdma_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const 
struct amdgpu_ring_funcs cik_sdma_ring_funcs = { @@ -1290,7 +1292,7 @@ static void cik_sdma_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: is this a secure operation + * @copy_flags: unused   *   * Copy GPU buffers using the DMA engine (CIK).   * Used by the amdgpu ttm implementation to move pages if @@ -1300,7 +1302,7 @@ static void cik_sdma_emit_copy_buffer(struct amdgpu_ib *ib,  				      uint64_t src_offset,  				      uint64_t dst_offset,  				      uint32_t byte_count, -				      bool tmz) +				      uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR, 0);  	ib->ptr[ib->length_dw++] = byte_count; diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c b/drivers/gpu/drm/amd/amdgpu/cz_ih.c index c19681492efa..072643787384 100644 --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c @@ -433,6 +433,8 @@ static const struct amd_ip_funcs cz_ih_ip_funcs = {  	.soft_reset = cz_ih_soft_reset,  	.set_clockgating_state = cz_ih_set_clockgating_state,  	.set_powergating_state = cz_ih_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs cz_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c index 221af054d874..b44fce44c066 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c @@ -3333,6 +3333,8 @@ static const struct amd_ip_funcs dce_v10_0_ip_funcs = {  	.soft_reset = dce_v10_0_soft_reset,  	.set_clockgating_state = dce_v10_0_set_clockgating_state,  	.set_powergating_state = dce_v10_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static void diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c index 69e8b0db6cf7..80b2e7f79acf 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c @@ -3464,6 +3464,8 @@ static const struct amd_ip_funcs dce_v11_0_ip_funcs = {  	.soft_reset = dce_v11_0_soft_reset,  	.set_clockgating_state = dce_v11_0_set_clockgating_state,  	.set_powergating_state = dce_v11_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static void diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c index 60d40201fdd1..db20012600f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c @@ -3154,6 +3154,8 @@ static const struct amd_ip_funcs dce_v6_0_ip_funcs = {  	.soft_reset = dce_v6_0_soft_reset,  	.set_clockgating_state = dce_v6_0_set_clockgating_state,  	.set_powergating_state = dce_v6_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static void diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c index 5a5fcc45e452..5b56100ec902 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c @@ -3242,6 +3242,8 @@ static const struct amd_ip_funcs dce_v8_0_ip_funcs = {  	.soft_reset = dce_v8_0_soft_reset,  	.set_clockgating_state = dce_v8_0_set_clockgating_state,  	.set_powergating_state = dce_v8_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static void diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index f90905ef32c7..536287ddd2ec 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -276,6 +276,99 @@ MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec.bin");  MODULE_FIRMWARE("amdgpu/gc_10_3_7_mec2.bin");  MODULE_FIRMWARE("amdgpu/gc_10_3_7_rlc.bin"); +static const struct amdgpu_hwip_reg_entry gc_reg_list_10_1[] = { +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS2), +	SOC15_REG_ENTRY_STR(GC, 0, mmGRBM_STATUS3), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT1), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_STALLED_STAT2), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STALLED_STAT1), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STALLED_STAT1), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_BUSY_STAT), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_BUSY_STAT2), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_BUSY_STAT2), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPF_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_ERROR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_GFX_HPD_STATUS0), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_BASE), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_RPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB_WPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_BASE), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_RPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB0_WPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_BASE), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_RPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB1_WPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_BASE), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_RB2_WPTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_CMD_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_CMD_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_CMD_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_CMD_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_LO), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BASE_HI), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB1_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_LO), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BASE_HI), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_IB2_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_LO), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BASE_HI), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB1_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_LO), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BASE_HI), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_IB2_BUFSZ), +	SOC15_REG_ENTRY_STR(GC, 0, mmCPF_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmCPC_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmCPG_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmGDS_PROTECTION_FAULT), +	SOC15_REG_ENTRY_STR(GC, 0, mmGDS_VM_PROTECTION_FAULT), +	SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmIA_UTCL1_STATUS_2), +	SOC15_REG_ENTRY_STR(GC, 0, mmPA_CL_CNTL_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmRMI_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmSQC_DCACHE_UTCL0_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmSQC_ICACHE_UTCL0_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmSQG_UTCL0_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmTCP_UTCL0_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmWD_UTCL1_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_CNTL), +	SOC15_REG_ENTRY_STR(GC, 0, mmGCVM_L2_PROTECTION_FAULT_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_DEBUG), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC_CNTL), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_CNTL), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CE_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MEC1_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, 
mmCP_MEC2_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_DEBUG_INTERRUPT_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_MES_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_ME_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_PFP_INSTR_PNTR), +	SOC15_REG_ENTRY_STR(GC, 0, mmCP_CPC_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_STAT), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_COMMAND), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_MESSAGE), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_1), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_2), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_3), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_ARGUMENT_4), +	SOC15_REG_ENTRY_STR(GC, 0, mmSMU_RLC_RESPONSE), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SAFE_MODE), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SMU_SAFE_MODE), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_GPM_STAT_2), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_SPP_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_RLCS_BOOTLOAD_STATUS), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_INT_STAT), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_GENERAL_6), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_A), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_B), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_GPM_DEBUG_INST_ADDR), +	SOC15_REG_ENTRY_STR(GC, 0, mmRLC_LX6_CORE_PDEBUG_INST) +}; +  static const struct soc15_reg_golden golden_settings_gc_10_1[] = {  	SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0xffffffff, 0x00400014),  	SOC15_REG_GOLDEN_VALUE(GC, 0, mmCGTT_CPF_CLK_CTRL, 0xfcff8fff, 0xf8000100), @@ -3964,7 +4057,7 @@ static void gfx_v10_0_check_gfxoff_flag(struct amdgpu_device *adev)  static int gfx_v10_0_init_microcode(struct amdgpu_device *adev)  { -	char fw_name[40]; +	char fw_name[53];  	char ucode_prefix[30];  	const char *wks = "";  	int err; @@ -4490,6 +4583,22 @@ static int gfx_v10_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,  			     hw_prio, NULL);  } +static void gfx_v10_0_alloc_dump_mem(struct amdgpu_device *adev) +{ +	uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); +	uint32_t *ptr; + +	ptr = kcalloc(reg_count, sizeof(uint32_t), GFP_KERNEL); +	if (ptr == NULL) { +		DRM_ERROR("Failed to allocate memory for IP Dump\n"); +		adev->gfx.ip_dump = NULL; +		adev->gfx.reg_count = 0; +	} else { +		adev->gfx.ip_dump = ptr; +		adev->gfx.reg_count = reg_count; +	} +} +  static int gfx_v10_0_sw_init(void *handle)  {  	int i, j, k, r, ring_id = 0; @@ -4518,7 +4627,7 @@ static int gfx_v10_0_sw_init(void *handle)  	case IP_VERSION(10, 3, 3):  	case IP_VERSION(10, 3, 7):  		adev->gfx.me.num_me = 1; -		adev->gfx.me.num_pipe_per_me = 1; +		adev->gfx.me.num_pipe_per_me = 2;  		adev->gfx.me.num_queue_per_pipe = 1;  		adev->gfx.mec.num_mec = 2;  		adev->gfx.mec.num_pipe_per_mec = 4; @@ -4642,6 +4751,8 @@ static int gfx_v10_0_sw_init(void *handle)  	gfx_v10_0_gpu_early_init(adev); +	gfx_v10_0_alloc_dump_mem(adev); +  	return 0;  } @@ -4694,6 +4805,8 @@ static int gfx_v10_0_sw_fini(void *handle)  	gfx_v10_0_free_microcode(adev); +	kfree(adev->gfx.ip_dump); +  	return 0;  } @@ -8317,7 +8430,7 @@ static void gfx_v10_0_ring_emit_hdp_flush(struct amdgpu_ring *ring)  		}  		reg_mem_engine = 0;  	} else { -		ref_and_mask = nbio_hf_reg->ref_and_mask_cp0; +		ref_and_mask = nbio_hf_reg->ref_and_mask_cp0 << ring->pipe;  		reg_mem_engine = 1; /* pfp */  	} @@ -9154,6 +9267,36 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)  	amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */  } +static void gfx_v10_ip_print(void *handle, struct drm_printer *p) +{ +	struct amdgpu_device *adev = (struct amdgpu_device *)handle; 
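	/* Annotation: adev->gfx.ip_dump is filled by gfx_v10_ip_dump() below,
	 * which snapshots every register in gc_reg_list_10_1 while GFXOFF is
	 * temporarily disabled; this print callback only walks that cached
	 * copy, so it can format its output after a hang without touching
	 * the hardware again.
	 */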
+	uint32_t i; +	uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); + +	if (!adev->gfx.ip_dump) +		return; + +	for (i = 0; i < reg_count; i++) +		drm_printf(p, "%-50s \t 0x%08x\n", +			   gc_reg_list_10_1[i].reg_name, +			   adev->gfx.ip_dump[i]); +} + +static void gfx_v10_ip_dump(void *handle) +{ +	struct amdgpu_device *adev = (struct amdgpu_device *)handle; +	uint32_t i; +	uint32_t reg_count = ARRAY_SIZE(gc_reg_list_10_1); + +	if (!adev->gfx.ip_dump) +		return; + +	amdgpu_gfx_off_ctrl(adev, false); +	for (i = 0; i < reg_count; i++) +		adev->gfx.ip_dump[i] = RREG32(SOC15_REG_ENTRY_OFFSET(gc_reg_list_10_1[i])); +	amdgpu_gfx_off_ctrl(adev, true); +} +  static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {  	.name = "gfx_v10_0",  	.early_init = gfx_v10_0_early_init, @@ -9170,6 +9313,8 @@ static const struct amd_ip_funcs gfx_v10_0_ip_funcs = {  	.set_clockgating_state = gfx_v10_0_set_clockgating_state,  	.set_powergating_state = gfx_v10_0_set_powergating_state,  	.get_clockgating_state = gfx_v10_0_get_clockgating_state, +	.dump_ip_state = gfx_v10_ip_dump, +	.print_ip_state = gfx_v10_ip_print,  };  static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { @@ -9186,7 +9331,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {  		7 + /* PIPELINE_SYNC */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* VM_FLUSH */ +		4 + /* VM_FLUSH */  		8 + /* FENCE for VM_FLUSH */  		20 + /* GDS switch */  		4 + /* double SWITCH_BUFFER, @@ -9276,7 +9421,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {  		7 + /* gfx_v10_0_ring_emit_pipeline_sync */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* gfx_v10_0_ring_emit_vm_flush */  		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence_kiq x3 for user fence, vm fence */  	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */  	.emit_ib = gfx_v10_0_ring_emit_ib_compute, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 1770e496c1b7..ad6431013c73 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -510,7 +510,7 @@ static void gfx_v11_0_check_fw_cp_gfx_shadow(struct amdgpu_device *adev)  static int gfx_v11_0_init_microcode(struct amdgpu_device *adev)  {  	char fw_name[40]; -	char ucode_prefix[30]; +	char ucode_prefix[25];  	int err;  	const struct rlc_firmware_header_v2_0 *rlc_hdr;  	uint16_t version_major; @@ -1635,7 +1635,7 @@ static void gfx_v11_0_setup_rb(struct amdgpu_device *adev)  			active_rb_bitmap |= (0x3 << (i * rb_bitmap_width_per_sa));  	} -	active_rb_bitmap |= global_active_rb_bitmap; +	active_rb_bitmap &= global_active_rb_bitmap;  	adev->gfx.config.backend_enable_mask = active_rb_bitmap;  	adev->gfx.config.num_rbs = hweight32(active_rb_bitmap);  } @@ -4506,14 +4506,11 @@ static int gfx_v11_0_soft_reset(void *handle)  	gfx_v11_0_set_safe_mode(adev, 0); +	mutex_lock(&adev->srbm_mutex);  	for (i = 0; i < adev->gfx.mec.num_mec; ++i) {  		for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {  			for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) { -				tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k); -				WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp); +				soc21_grbm_select(adev, i, k, j, 0);  				WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 0x2);  				WREG32_SOC15(GC, 0, 
regSPI_COMPUTE_QUEUE_RESET, 0x1); @@ -4523,16 +4520,14 @@ static int gfx_v11_0_soft_reset(void *handle)  	for (i = 0; i < adev->gfx.me.num_me; ++i) {  		for (j = 0; j < adev->gfx.me.num_queue_per_pipe; j++) {  			for (k = 0; k < adev->gfx.me.num_pipe_per_me; k++) { -				tmp = RREG32_SOC15(GC, 0, regGRBM_GFX_CNTL); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, MEID, i); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, QUEUEID, j); -				tmp = REG_SET_FIELD(tmp, GRBM_GFX_CNTL, PIPEID, k); -				WREG32_SOC15(GC, 0, regGRBM_GFX_CNTL, tmp); +				soc21_grbm_select(adev, i, k, j, 0);  				WREG32_SOC15(GC, 0, regCP_GFX_HQD_DEQUEUE_REQUEST, 0x1);  			}  		}  	} +	soc21_grbm_select(adev, 0, 0, 0, 0); +	mutex_unlock(&adev->srbm_mutex);  	/* Try to acquire the gfx mutex before access to CP_VMID_RESET */  	r = gfx_v11_0_request_gfx_index_mutex(adev, 1); @@ -5465,6 +5460,7 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring,  	/* Make sure that we can't skip the SET_Q_MODE packets when the VM  	 * changed in any way.  	 */ +	ring->set_q_mode_offs = 0;  	ring->set_q_mode_ptr = NULL;  } @@ -6173,6 +6169,8 @@ static const struct amd_ip_funcs gfx_v11_0_ip_funcs = {  	.set_clockgating_state = gfx_v11_0_set_clockgating_state,  	.set_powergating_state = gfx_v11_0_set_powergating_state,  	.get_clockgating_state = gfx_v11_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = { @@ -6191,7 +6189,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = {  		7 + /* PIPELINE_SYNC */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* VM_FLUSH */ +		4 + /* VM_FLUSH */  		8 + /* FENCE for VM_FLUSH */  		20 + /* GDS switch */  		5 + /* COND_EXEC */ @@ -6277,7 +6275,6 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_kiq = {  		7 + /* gfx_v11_0_ring_emit_pipeline_sync */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* gfx_v11_0_ring_emit_vm_flush */  		8 + 8 + 8, /* gfx_v11_0_ring_emit_fence_kiq x3 for user fence, vm fence */  	.emit_ib_size =	7, /* gfx_v11_0_ring_emit_ib_compute */  	.emit_ib = gfx_v11_0_ring_emit_ib_compute, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c index 34f9211b2679..d0992ce9fb47 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c @@ -3457,6 +3457,8 @@ static const struct amd_ip_funcs gfx_v6_0_ip_funcs = {  	.soft_reset = gfx_v6_0_soft_reset,  	.set_clockgating_state = gfx_v6_0_set_clockgating_state,  	.set_powergating_state = gfx_v6_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v6_0_ring_funcs_gfx = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 86a4865b1ae5..541dbd70d8c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -4977,6 +4977,8 @@ static const struct amd_ip_funcs gfx_v7_0_ip_funcs = {  	.soft_reset = gfx_v7_0_soft_reset,  	.set_clockgating_state = gfx_v7_0_set_clockgating_state,  	.set_powergating_state = gfx_v7_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v7_0_ring_funcs_gfx = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 202ddda57f98..2f0e72caee1a 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6878,6 +6878,8 @@ static const struct amd_ip_funcs gfx_v8_0_ip_funcs = {  	.set_clockgating_state = gfx_v8_0_set_clockgating_state,  	.set_powergating_state = gfx_v8_0_set_powergating_state,  	.get_clockgating_state = gfx_v8_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6f97a6d0e6d0..3c8c5abf35ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -1249,7 +1249,7 @@ static void gfx_v9_0_check_if_need_gfxoff(struct amdgpu_device *adev)  static int gfx_v9_0_init_cp_gfx_microcode(struct amdgpu_device *adev,  					  char *chip_name)  { -	char fw_name[30]; +	char fw_name[50];  	int err;  	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_pfp.bin", chip_name); @@ -1282,7 +1282,7 @@ out:  static int gfx_v9_0_init_rlc_microcode(struct amdgpu_device *adev,  				       char *chip_name)  { -	char fw_name[30]; +	char fw_name[53];  	int err;  	const struct rlc_firmware_header_v2_0 *rlc_hdr;  	uint16_t version_major; @@ -1337,7 +1337,7 @@ static bool gfx_v9_0_load_mec2_fw_bin_support(struct amdgpu_device *adev)  static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,  					      char *chip_name)  { -	char fw_name[30]; +	char fw_name[50];  	int err;  	if (amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_ALDEBARAN)) @@ -6856,6 +6856,8 @@ static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {  	.set_clockgating_state = gfx_v9_0_set_clockgating_state,  	.set_powergating_state = gfx_v9_0_set_powergating_state,  	.get_clockgating_state = gfx_v9_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = { @@ -6981,7 +6983,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* gfx_v9_0_ring_emit_vm_flush */  		8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */  		7 + /* gfx_v9_0_emit_mem_sync */  		5 + /* gfx_v9_0_emit_wave_limit for updating mmSPI_WCL_PIPE_PERCENT_GFX register */ @@ -7019,7 +7020,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {  		7 + /* gfx_v9_0_ring_emit_pipeline_sync */  		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +  		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + -		2 + /* gfx_v9_0_ring_emit_vm_flush */  		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */  	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */  	.emit_fence = gfx_v9_0_ring_emit_fence_kiq, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index 065b2bd5f5a6..3f4fd2f08163 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1909,18 +1909,7 @@ static void gfx_v9_4_2_reset_sq_timeout_status(struct amdgpu_device *adev)  	mutex_unlock(&adev->grbm_idx_mutex);  } -static bool gfx_v9_4_2_query_uctl2_poison_status(struct amdgpu_device *adev) -{ -	u32 status = 0; -	struct amdgpu_vmhub *hub; - -	hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; -	status = RREG32(hub->vm_l2_pro_fault_status); -	/* reset page fault status */ -	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); -	return REG_GET_FIELD(status, 
VM_L2_PROTECTION_FAULT_STATUS, FED); -}  struct amdgpu_ras_block_hw_ops  gfx_v9_4_2_ras_ops = {  		.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, @@ -1934,5 +1923,4 @@ struct amdgpu_gfx_ras gfx_v9_4_2_ras = {  		.hw_ops = &gfx_v9_4_2_ras_ops,  	},  	.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, -	.query_utcl2_poison_status = gfx_v9_4_2_query_uctl2_poison_status,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index b53c8fd4e8cf..7b16e8cca86a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -431,16 +431,16 @@ out:  static int gfx_v9_4_3_init_microcode(struct amdgpu_device *adev)  { -	const char *chip_name; +	char ucode_prefix[15];  	int r; -	chip_name = "gc_9_4_3"; +	amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix)); -	r = gfx_v9_4_3_init_rlc_microcode(adev, chip_name); +	r = gfx_v9_4_3_init_rlc_microcode(adev, ucode_prefix);  	if (r)  		return r; -	r = gfx_v9_4_3_init_cp_compute_microcode(adev, chip_name); +	r = gfx_v9_4_3_init_cp_compute_microcode(adev, ucode_prefix);  	if (r)  		return r; @@ -680,38 +680,44 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_3_gfx_funcs = {  	.ih_node_to_logical_xcc = &gfx_v9_4_3_ih_to_xcc_inst,  }; -static int gfx_v9_4_3_aca_bank_generate_report(struct aca_handle *handle, -					       struct aca_bank *bank, enum aca_error_type type, -					       struct aca_bank_report *report, void *data) +static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, +				      struct aca_bank *bank, enum aca_smu_type type, +				      void *data)  { -	u64 status, misc0; +	struct aca_bank_info info; +	u64 misc0;  	u32 instlo;  	int ret; -	status = bank->regs[ACA_REG_IDX_STATUS]; -	if ((type == ACA_ERROR_TYPE_UE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || -	    (type == ACA_ERROR_TYPE_CE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { +	ret = aca_bank_info_decode(bank, &info); +	if (ret) +		return ret; -		ret = aca_bank_info_decode(bank, &report->info); -		if (ret) -			return ret; +	/* NOTE: overwrite info.die_id with xcd id for gfx */ +	instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); +	instlo &= GENMASK(31, 1); +	info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 0 : 1; -		/* NOTE: overwrite info.die_id with xcd id for gfx */ -		instlo = ACA_REG__IPID__INSTANCEIDLO(bank->regs[ACA_REG_IDX_IPID]); -		instlo &= GENMASK(31, 1); -		report->info.die_id = instlo == mmSMNAID_XCD0_MCA_SMU ? 
0 : 1; +	misc0 = bank->regs[ACA_REG_IDX_MISC0]; -		misc0 = bank->regs[ACA_REG_IDX_MISC0]; -		report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); +	switch (type) { +	case ACA_SMU_TYPE_UE: +		ret = aca_error_cache_log_bank_error(handle, &info, +						     ACA_ERROR_TYPE_UE, 1ULL); +		break; +	case ACA_SMU_TYPE_CE: +		ret = aca_error_cache_log_bank_error(handle, &info, +						     ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0)); +		break; +	default: +		return -EINVAL;  	} -	return 0; +	return ret;  }  static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, -					 enum aca_error_type type, void *data) +					 enum aca_smu_type type, void *data)  {  	u32 instlo; @@ -730,7 +736,7 @@ static bool gfx_v9_4_3_aca_bank_is_valid(struct aca_handle *handle, struct aca_b  }  static const struct aca_bank_ops gfx_v9_4_3_aca_bank_ops = { -	.aca_bank_generate_report = gfx_v9_4_3_aca_bank_generate_report, +	.aca_bank_parser = gfx_v9_4_3_aca_bank_parser,  	.aca_bank_is_valid = gfx_v9_4_3_aca_bank_is_valid,  }; @@ -2398,10 +2404,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,  		if (def != data)  			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGTT_MGCG_OVERRIDE, data); -		/* enable cgcg FSM(0x0000363F) */ +		/* CGCG Hysteresis: 400us */  		def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL); -		data = (0x36 +		data = (0x2710  			<< RLC_CGCG_CGLS_CTRL__CGCG_GFX_IDLE_THRESHOLD__SHIFT) |  		       RLC_CGCG_CGLS_CTRL__CGCG_EN_MASK;  		if (adev->cg_flags & AMD_CG_SUPPORT_GFX_CGLS) @@ -2410,10 +2416,10 @@ gfx_v9_4_3_xcc_update_coarse_grain_clock_gating(struct amdgpu_device *adev,  		if (def != data)  			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regRLC_CGCG_CGLS_CTRL, data); -		/* set IDLE_POLL_COUNT(0x00900100) */ +		/* set IDLE_POLL_COUNT(0x33450100)*/  		def = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL);  		data = (0x0100 << CP_RB_WPTR_POLL_CNTL__POLL_FREQUENCY__SHIFT) | -			(0x0090 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT); +			(0x3345 << CP_RB_WPTR_POLL_CNTL__IDLE_POLL_COUNT__SHIFT);  		if (def != data)  			WREG32_SOC15(GC, GET_INST(GC, xcc_id), regCP_RB_WPTR_POLL_CNTL, data);  	} else { @@ -4010,6 +4016,8 @@ static const struct amd_ip_funcs gfx_v9_4_3_ip_funcs = {  	.set_clockgating_state = gfx_v9_4_3_set_clockgating_state,  	.set_powergating_state = gfx_v9_4_3_set_powergating_state,  	.get_clockgating_state = gfx_v9_4_3_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs gfx_v9_4_3_ring_funcs_compute = { diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index 22175da0e16a..d200310d1731 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -443,6 +443,22 @@ static void gfxhub_v1_0_init(struct amdgpu_device *adev)  		mmVM_INVALIDATE_ENG0_ADDR_RANGE_LO32;  } +static bool gfxhub_v1_0_query_utcl2_poison_status(struct amdgpu_device *adev, +				int xcc_id) +{ +	u32 status = 0; +	struct amdgpu_vmhub *hub; + +	if (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) +		return false; + +	hub = &adev->vmhub[AMDGPU_GFXHUB(0)]; +	status = RREG32(hub->vm_l2_pro_fault_status); +	/* reset page fault status */ +	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1); + +	return REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +}  const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {  	.get_mc_fb_offset = gfxhub_v1_0_get_mc_fb_offset, @@ -452,4 +468,5 
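The reworked gfx_v9_4_3 parser above no longer fills an aca_bank_report; it logs straight into the error cache, counting an uncorrectable bank as a single event and taking the correctable count from the ERRCNT field of MISC0. A stand-alone sketch of that dispatch, with an invented ERRCNT bit position, might look like:

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical MISC0 layout: error count in bits [43:32] (invented). */
#define MISC0_ERRCNT(misc0) (((misc0) >> 32) & 0xfffULL)

enum smu_bank_type { BANK_UE, BANK_CE };

/* Log one bank the way the new parser does: a UE bank counts as one
 * event, a CE bank contributes whatever the hardware counted. */
static int log_bank(enum smu_bank_type type, uint64_t misc0,
		    uint64_t *ue, uint64_t *ce)
{
	switch (type) {
	case BANK_UE:
		*ue += 1ULL;
		return 0;
	case BANK_CE:
		*ce += MISC0_ERRCNT(misc0);
		return 0;
	default:
		return -1; /* mirrors the -EINVAL path in the hunk above */
	}
}

int main(void)
{
	uint64_t ue = 0, ce = 0;

	log_bank(BANK_UE, 0, &ue, &ce);
	log_bank(BANK_CE, (uint64_t)5 << 32, &ue, &ce);
	printf("ue=%llu ce=%llu\n",
	       (unsigned long long)ue, (unsigned long long)ce);
	return 0;
}
```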
@@ const struct amdgpu_gfxhub_funcs gfxhub_v1_0_funcs = {  	.set_fault_enable_default = gfxhub_v1_0_set_fault_enable_default,  	.init = gfxhub_v1_0_init,  	.get_xgmi_info = gfxhub_v1_1_get_xgmi_info, +	.query_utcl2_poison_status = gfxhub_v1_0_query_utcl2_poison_status,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c index 49aecdcee006..77df8c9cbad2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c @@ -620,6 +620,20 @@ static int gfxhub_v1_2_get_xgmi_info(struct amdgpu_device *adev)  	return 0;  } +static bool gfxhub_v1_2_query_utcl2_poison_status(struct amdgpu_device *adev, +				int xcc_id) +{ +	u32 fed, status; + +	status = RREG32_SOC15(GC, GET_INST(GC, xcc_id), regVM_L2_PROTECTION_FAULT_STATUS); +	fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +	/* reset page fault status */ +	WREG32_P(SOC15_REG_OFFSET(GC, GET_INST(GC, xcc_id), +			regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + +	return fed; +} +  const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {  	.get_mc_fb_offset = gfxhub_v1_2_get_mc_fb_offset,  	.setup_vm_pt_regs = gfxhub_v1_2_setup_vm_pt_regs, @@ -628,6 +642,7 @@ const struct amdgpu_gfxhub_funcs gfxhub_v1_2_funcs = {  	.set_fault_enable_default = gfxhub_v1_2_set_fault_enable_default,  	.init = gfxhub_v1_2_init,  	.get_xgmi_info = gfxhub_v1_2_get_xgmi_info, +	.query_utcl2_poison_status = gfxhub_v1_2_query_utcl2_poison_status,  };  static int gfxhub_v1_2_xcp_resume(void *handle, uint32_t inst_mask) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c index 23b478639921..3e38d8bfcb69 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c @@ -1115,6 +1115,8 @@ static const struct amd_ip_funcs gmc_v6_0_ip_funcs = {  	.soft_reset = gmc_v6_0_soft_reset,  	.set_clockgating_state = gmc_v6_0_set_clockgating_state,  	.set_powergating_state = gmc_v6_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_gmc_funcs gmc_v6_0_gmc_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c index 3da7b6a2b00d..85df8fc81065 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c @@ -1354,6 +1354,8 @@ static const struct amd_ip_funcs gmc_v7_0_ip_funcs = {  	.soft_reset = gmc_v7_0_soft_reset,  	.set_clockgating_state = gmc_v7_0_set_clockgating_state,  	.set_powergating_state = gmc_v7_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_gmc_funcs gmc_v7_0_gmc_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c index d20e5f20ee31..fc97757e33d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c @@ -1717,6 +1717,8 @@ static const struct amd_ip_funcs gmc_v8_0_ip_funcs = {  	.set_clockgating_state = gmc_v8_0_set_clockgating_state,  	.set_powergating_state = gmc_v8_0_set_powergating_state,  	.get_clockgating_state = gmc_v8_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_gmc_funcs gmc_v8_0_gmc_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 47b63a4ce68b..c4ec1358f3aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -548,7 +548,7 @@ static int 
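Both new query_utcl2_poison_status() helpers share one shape: read the VM_L2_PROTECTION_FAULT_STATUS register, extract the FED (fatal error detected) field, then write 1 back to clear the latched fault. A generic sketch of the read-then-write-1-to-clear idiom, with a plain variable standing in for MMIO and an invented FED bit position:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FAULT_STATUS_FED (1u << 7)              /* invented bit position */

static uint32_t fake_status_reg = FAULT_STATUS_FED; /* stands in for MMIO */

static uint32_t reg_read(void)    { return fake_status_reg; }
static void reg_write(uint32_t v) { fake_status_reg &= ~v; } /* W1C semantics */

static bool query_poison_status(void)
{
	uint32_t status = reg_read();
	bool fed = status & FAULT_STATUS_FED;

	/* clear the latched fault so the next query sees fresh state */
	reg_write(FAULT_STATUS_FED);
	return fed;
}

int main(void)
{
	printf("first query:  %d\n", query_poison_status()); /* 1 */
	printf("second query: %d\n", query_poison_status()); /* 0 */
	return 0;
}
```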
gmc_v9_0_process_interrupt(struct amdgpu_device *adev,  {  	bool retry_fault = !!(entry->src_data[1] & 0x80);  	bool write_fault = !!(entry->src_data[1] & 0x20); -	uint32_t status = 0, cid = 0, rw = 0; +	uint32_t status = 0, cid = 0, rw = 0, fed = 0;  	struct amdgpu_task_info *task_info;  	struct amdgpu_vmhub *hub;  	const char *mmhub_cid; @@ -664,6 +664,13 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,  	status = RREG32(hub->vm_l2_pro_fault_status);  	cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);  	rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW); +	fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); + +	/* for fed error, kfd will handle it, return directly */ +	if (fed && amdgpu_ras_is_poison_mode_supported(adev) && +	    (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2))) +		return 0; +  	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);  	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub); @@ -1450,7 +1457,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev)  		adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;  		adev->umc.active_mask = adev->aid_mask;  		adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; -		adev->umc.channel_idx_tbl = &umc_v12_0_channel_idx_tbl[0][0][0];  		if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)  			adev->umc.ras = &umc_v12_0_ras;  		break; diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c index 2c02ae69883d..07984f7c3ae7 100644 --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c @@ -425,6 +425,8 @@ static const struct amd_ip_funcs iceland_ih_ip_funcs = {  	.soft_reset = iceland_ih_soft_reset,  	.set_clockgating_state = iceland_ih_set_clockgating_state,  	.set_powergating_state = iceland_ih_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs iceland_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c index ad4ad39f128f..3cb64c8f7175 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c @@ -346,6 +346,21 @@ static int ih_v6_0_irq_init(struct amdgpu_device *adev)  			    DELAY, 3);  	WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp); +	/* Redirect the interrupts to IH RB1 for dGPU */ +	if (adev->irq.ih1.ring_size) { +		tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0); +		WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp); + +		tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, +				    SOURCE_ID_MATCH_ENABLE, 0x1); + +		WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp); +	} +  	pci_set_master(adev->pdev);  	/* enable interrupts */ @@ -549,8 +564,15 @@ static int ih_v6_0_sw_init(void *handle)  	adev->irq.ih.use_doorbell = true;  	adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1; -	adev->irq.ih1.ring_size = 0; -	adev->irq.ih2.ring_size = 0; +	if (!(adev->flags & AMD_IS_APU)) { +		r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE, +					use_bus_addr); +		if (r) +			return r; + +		adev->irq.ih1.use_doorbell = true; +		adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1; +	}  	
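The IH v6.0 hunk redirects interrupts to ring buffer 1 through an index/data register pair: the slot number goes into IH_RING1_CLIENT_CFG_INDEX and the packed client/source fields into IH_RING1_CLIENT_CFG_DATA. A minimal sketch of that indirect-register pattern over a fake register file; the field packing here is invented:

```c
#include <stdint.h>
#include <stdio.h>

/* Fake indirect register file: INDEX selects which slot DATA accesses. */
static uint32_t cfg_slots[8];
static uint32_t cfg_index;

static void write_index(uint32_t idx) { cfg_index = idx & 7; }
static void write_data(uint32_t val)  { cfg_slots[cfg_index] = val; }

/* Invented packing: client id in [7:0], source id in [15:8],
 * match-enable in bit 16. */
static uint32_t pack_client(uint32_t client, uint32_t source, int match)
{
	return (client & 0xff) | ((source & 0xff) << 8) |
	       ((match ? 1u : 0u) << 16);
}

int main(void)
{
	/* redirect client 0xa, source 0, with source-id matching, via slot 0 */
	write_index(0);
	write_data(pack_client(0xa, 0x0, 1));
	printf("slot0 = 0x%05x\n", (unsigned)cfg_slots[0]);
	return 0;
}
```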
/* initialize ih control register offset */  	ih_v6_0_init_register_offset(adev); @@ -748,6 +770,8 @@ static const struct amd_ip_funcs ih_v6_0_ip_funcs = {  	.set_clockgating_state = ih_v6_0_set_clockgating_state,  	.set_powergating_state = ih_v6_0_set_powergating_state,  	.get_clockgating_state = ih_v6_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs ih_v6_0_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c index b8da0fc29378..0fbf5fa7b0f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_1.c @@ -346,6 +346,21 @@ static int ih_v6_1_irq_init(struct amdgpu_device *adev)  			    DELAY, 3);  	WREG32_SOC15(OSSSYS, 0, regIH_MSI_STORM_CTRL, tmp); +	/* Redirect the interrupts to IH RB1 for dGPU */ +	if (adev->irq.ih1.ring_size) { +		tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_INDEX, INDEX, 0); +		WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_INDEX, tmp); + +		tmp = RREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, CLIENT_ID, 0xa); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, SOURCE_ID, 0x0); +		tmp = REG_SET_FIELD(tmp, IH_RING1_CLIENT_CFG_DATA, +				    SOURCE_ID_MATCH_ENABLE, 0x1); + +		WREG32_SOC15(OSSSYS, 0, regIH_RING1_CLIENT_CFG_DATA, tmp); +	} +  	pci_set_master(adev->pdev);  	/* enable interrupts */ @@ -550,8 +565,15 @@ static int ih_v6_1_sw_init(void *handle)  	adev->irq.ih.use_doorbell = true;  	adev->irq.ih.doorbell_index = adev->doorbell_index.ih << 1; -	adev->irq.ih1.ring_size = 0; -	adev->irq.ih2.ring_size = 0; +	if (!(adev->flags & AMD_IS_APU)) { +		r = amdgpu_ih_ring_init(adev, &adev->irq.ih1, IH_RING_SIZE, +					use_bus_addr); +		if (r) +			return r; + +		adev->irq.ih1.use_doorbell = true; +		adev->irq.ih1.doorbell_index = (adev->doorbell_index.ih + 1) << 1; +	}  	/* initialize ih control register offset */  	ih_v6_1_init_register_offset(adev); @@ -753,6 +775,8 @@ static const struct amd_ip_funcs ih_v6_1_ip_funcs = {  	.set_clockgating_state = ih_v6_1_set_clockgating_state,  	.set_powergating_state = ih_v6_1_set_powergating_state,  	.get_clockgating_state = ih_v6_1_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs ih_v6_1_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c b/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c index 7aed96fa10a9..aa6235dd4f2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/ih_v7_0.c @@ -749,6 +749,8 @@ static const struct amd_ip_funcs ih_v7_0_ip_funcs = {  	.set_clockgating_state = ih_v7_0_set_clockgating_state,  	.set_powergating_state = ih_v7_0_set_powergating_state,  	.get_clockgating_state = ih_v7_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs ih_v7_0_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c index 1c8116d75f63..ef3e42f6b841 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c @@ -759,6 +759,8 @@ static const struct amd_ip_funcs jpeg_v2_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v2_0_set_clockgating_state,  	.set_powergating_state = jpeg_v2_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs 
jpeg_v2_0_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c index 99cd49ee8ef6..afeaf3c64e27 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_5.c @@ -632,6 +632,8 @@ static const struct amd_ip_funcs jpeg_v2_5_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v2_5_set_clockgating_state,  	.set_powergating_state = jpeg_v2_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = { @@ -652,6 +654,8 @@ static const struct amd_ip_funcs jpeg_v2_6_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v2_5_set_clockgating_state,  	.set_powergating_state = jpeg_v2_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v2_5_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c index a92481da60cd..1c7cf4800bf7 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v3_0.c @@ -557,6 +557,8 @@ static const struct amd_ip_funcs jpeg_v3_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v3_0_set_clockgating_state,  	.set_powergating_state = jpeg_v3_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v3_0_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c index 88ea58d5c4ab..237fe5df5a8f 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0.c @@ -719,6 +719,8 @@ static const struct amd_ip_funcs jpeg_v4_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v4_0_set_clockgating_state,  	.set_powergating_state = jpeg_v4_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v4_0_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index 32caeb37cef9..d66af11aa66c 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -1053,6 +1053,8 @@ static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v4_0_3_set_clockgating_state,  	.set_powergating_state = jpeg_v4_0_3_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v4_0_3_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c index edf5bcdd2bc9..da6bb9022b80 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c @@ -762,6 +762,8 @@ static const struct amd_ip_funcs jpeg_v4_0_5_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v4_0_5_set_clockgating_state,  	.set_powergating_state = jpeg_v4_0_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v4_0_5_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c index e70200f97555..64c856bfe0cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v5_0_0.c @@ -513,6 +513,8 @@ static const struct amd_ip_funcs 
jpeg_v5_0_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = jpeg_v5_0_0_set_clockgating_state,  	.set_powergating_state = jpeg_v5_0_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs jpeg_v5_0_0_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c index 1e5ad1e08d2a..a626bf904926 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c @@ -1176,6 +1176,8 @@ static const struct amd_ip_funcs mes_v10_1_ip_funcs = {  	.hw_fini = mes_v10_1_hw_fini,  	.suspend = mes_v10_1_suspend,  	.resume = mes_v10_1_resume, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version mes_v10_1_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 072c478665ad..0d1407f25005 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -100,18 +100,76 @@ static const struct amdgpu_ring_funcs mes_v11_0_ring_funcs = {  	.insert_nop = amdgpu_ring_insert_nop,  }; +static const char *mes_v11_0_opcodes[] = { +	"SET_HW_RSRC", +	"SET_SCHEDULING_CONFIG", +	"ADD_QUEUE", +	"REMOVE_QUEUE", +	"PERFORM_YIELD", +	"SET_GANG_PRIORITY_LEVEL", +	"SUSPEND", +	"RESUME", +	"RESET", +	"SET_LOG_BUFFER", +	"CHANGE_GANG_PRORITY", +	"QUERY_SCHEDULER_STATUS", +	"PROGRAM_GDS", +	"SET_DEBUG_VMID", +	"MISC", +	"UPDATE_ROOT_PAGE_TABLE", +	"AMD_LOG", +}; + +static const char *mes_v11_0_misc_opcodes[] = { +	"WRITE_REG", +	"INV_GART", +	"QUERY_STATUS", +	"READ_REG", +	"WAIT_REG_MEM", +	"SET_SHADER_DEBUGGER", +}; + +static const char *mes_v11_0_get_op_string(union MESAPI__MISC *x_pkt) +{ +	const char *op_str = NULL; + +	if (x_pkt->header.opcode < ARRAY_SIZE(mes_v11_0_opcodes)) +		op_str = mes_v11_0_opcodes[x_pkt->header.opcode]; + +	return op_str; +} + +static const char *mes_v11_0_get_misc_op_string(union MESAPI__MISC *x_pkt) +{ +	const char *op_str = NULL; + +	if ((x_pkt->header.opcode == MES_SCH_API_MISC) && +	    (x_pkt->opcode < ARRAY_SIZE(mes_v11_0_misc_opcodes))) +		op_str = mes_v11_0_misc_opcodes[x_pkt->opcode]; + +	return op_str; +} +  static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,  						    void *pkt, int size,  						    int api_status_off)  {  	int ndw = size / 4;  	signed long r; -	union MESAPI__ADD_QUEUE *x_pkt = pkt; +	union MESAPI__MISC *x_pkt = pkt;  	struct MES_API_STATUS *api_status;  	struct amdgpu_device *adev = mes->adev;  	struct amdgpu_ring *ring = &mes->ring;  	unsigned long flags; -	signed long timeout = adev->usec_timeout; +	signed long timeout = 3000000; /* 3000 ms */ +	const char *op_str, *misc_op_str; +	u32 fence_offset; +	u64 fence_gpu_addr; +	u64 *fence_ptr; +	int ret; + +	if (x_pkt->header.opcode >= MES_SCH_API_MAX) +		return -EINVAL;  	if (amdgpu_emu_mode) {  		timeout *= 100; @@ -121,27 +179,52 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,  	}  	BUG_ON(size % 4 != 0); +	ret = amdgpu_device_wb_get(adev, &fence_offset); +	if (ret) +		return ret; +	fence_gpu_addr = +		adev->wb.gpu_addr + (fence_offset * 4); +	fence_ptr = (u64 *)&adev->wb.wb[fence_offset]; +	*fence_ptr = 0; +  	spin_lock_irqsave(&mes->ring_lock, flags);  	if (amdgpu_ring_alloc(ring, ndw)) {  		spin_unlock_irqrestore(&mes->ring_lock, flags); +		amdgpu_device_wb_free(adev, fence_offset);  		return -ENOMEM;  	}  	api_status = (struct MES_API_STATUS *)((char *)pkt + 
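The new mes_v11_0_get_op_string() helpers above are bounds-checked table lookups: an opcode only indexes the string array when it is below ARRAY_SIZE, and a NULL return tells the caller to fall back to printing the raw number. The same idiom in isolation:

```c
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char *opcodes[] = {
	"SET_HW_RSRC", "ADD_QUEUE", "REMOVE_QUEUE", "MISC",
};

/* Returns NULL for out-of-range opcodes so callers can fall back to
 * printing the raw number, as the MES debug output above does. */
static const char *op_string(unsigned int opcode)
{
	return opcode < ARRAY_SIZE(opcodes) ? opcodes[opcode] : NULL;
}

int main(void)
{
	unsigned int op = 17;
	const char *s = op_string(op);

	if (s)
		printf("MES msg=%s\n", s);
	else
		printf("MES msg=%u\n", op); /* fallback path */
	return 0;
}
```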
api_status_off); -	api_status->api_completion_fence_addr = mes->ring.fence_drv.gpu_addr; -	api_status->api_completion_fence_value = ++mes->ring.fence_drv.sync_seq; +	api_status->api_completion_fence_addr = fence_gpu_addr; +	api_status->api_completion_fence_value = 1;  	amdgpu_ring_write_multiple(ring, pkt, ndw);  	amdgpu_ring_commit(ring);  	spin_unlock_irqrestore(&mes->ring_lock, flags); -	DRM_DEBUG("MES msg=%d was emitted\n", x_pkt->header.opcode); +	op_str = mes_v11_0_get_op_string(x_pkt); +	misc_op_str = mes_v11_0_get_misc_op_string(x_pkt); + +	if (misc_op_str) +		dev_dbg(adev->dev, "MES msg=%s (%s) was emitted\n", op_str, misc_op_str); +	else if (op_str) +		dev_dbg(adev->dev, "MES msg=%s was emitted\n", op_str); +	else +		dev_dbg(adev->dev, "MES msg=%d was emitted\n", x_pkt->header.opcode); -	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq, -		      timeout); +	r = amdgpu_mes_fence_wait_polling(fence_ptr, (u64)1, timeout); +	amdgpu_device_wb_free(adev, fence_offset);  	if (r < 1) { -		DRM_ERROR("MES failed to response msg=%d\n", -			  x_pkt->header.opcode); + +		if (misc_op_str) +			dev_err(adev->dev, "MES failed to respond to msg=%s (%s)\n", +				op_str, misc_op_str); +		else if (op_str) +			dev_err(adev->dev, "MES failed to respond to msg=%s\n", +				op_str); +		else +			dev_err(adev->dev, "MES failed to respond to msg=%d\n", +				x_pkt->header.opcode);  		while (halt_if_hws_hang)  			schedule(); @@ -411,14 +494,47 @@ static int mes_v11_0_set_hw_resources(struct amdgpu_mes *mes)  	mes_set_hw_res_pkt.enable_reg_active_poll = 1;  	mes_set_hw_res_pkt.enable_level_process_quantum_check = 1;  	mes_set_hw_res_pkt.oversubscription_timer = 50; -	mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; -	mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = mes->event_log_gpu_addr; +	if (amdgpu_mes_log_enable) { +		mes_set_hw_res_pkt.enable_mes_event_int_logging = 1; +		mes_set_hw_res_pkt.event_intr_history_gpu_mc_ptr = +					mes->event_log_gpu_addr; +	}  	return mes_v11_0_submit_pkt_and_poll_completion(mes,  			&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt),  			offsetof(union MESAPI_SET_HW_RESOURCES, api_status));  } +static int mes_v11_0_set_hw_resources_1(struct amdgpu_mes *mes) +{ +	int size = 128 * PAGE_SIZE; +	int ret = 0; +	struct amdgpu_device *adev = mes->adev; +	union MESAPI_SET_HW_RESOURCES_1 mes_set_hw_res_pkt; +	memset(&mes_set_hw_res_pkt, 0, sizeof(mes_set_hw_res_pkt)); + +	mes_set_hw_res_pkt.header.type = MES_API_TYPE_SCHEDULER; +	mes_set_hw_res_pkt.header.opcode = MES_SCH_API_SET_HW_RSRC_1; +	mes_set_hw_res_pkt.header.dwsize = API_FRAME_SIZE_IN_DWORDS; +	mes_set_hw_res_pkt.enable_mes_info_ctx = 1; + +	ret = amdgpu_bo_create_kernel(adev, size, PAGE_SIZE, +				AMDGPU_GEM_DOMAIN_VRAM, +				&mes->resource_1, +				&mes->resource_1_gpu_addr, +				&mes->resource_1_addr); +	if (ret) { +		dev_err(adev->dev, "(%d) failed to create mes resource_1 bo\n", ret); +		return ret; +	} + +	mes_set_hw_res_pkt.mes_info_ctx_mc_addr = mes->resource_1_gpu_addr; +	mes_set_hw_res_pkt.mes_info_ctx_size = mes->resource_1->tbo.base.size; +	return mes_v11_0_submit_pkt_and_poll_completion(mes, +			&mes_set_hw_res_pkt, sizeof(mes_set_hw_res_pkt), +			offsetof(union MESAPI_SET_HW_RESOURCES_1, api_status)); +} +  static const struct amdgpu_mes_funcs mes_v11_0_funcs = {  	.add_hw_queue = mes_v11_0_add_hw_queue,  	.remove_hw_queue = mes_v11_0_remove_hw_queue, @@ -1200,6 +1316,14 @@ static int mes_v11_0_hw_init(void *handle)  	if (r)  		goto failure; +	if (amdgpu_sriov_is_mes_info_enable(adev)) { +		r = 
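The submission path above stops reusing the ring's fence sequence and instead polls a dedicated writeback slot: the packet carries the slot's GPU address, the firmware writes 1 there on completion, and the driver spins on the CPU mapping until the value appears or the 3000 ms budget expires. A loose userspace sketch of that completion poll, with a thread standing in for the firmware:

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static _Atomic uint64_t fence; /* stands in for the writeback slot */

static void *firmware(void *arg)
{
	(void)arg;
	usleep(10000);           /* pretend to process the packet */
	atomic_store(&fence, 1); /* the api_completion_fence_value */
	return NULL;
}

/* Poll until the fence reaches wait_val or the microsecond budget runs
 * out; returns 1 on success, 0 on timeout. A loose model of the wait. */
static int fence_wait_polling(long timeout_us, uint64_t wait_val)
{
	while (timeout_us-- > 0) {
		if (atomic_load(&fence) == wait_val)
			return 1;
		usleep(1);
	}
	return 0;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, firmware, NULL);
	printf(fence_wait_polling(3000000, 1) ?
	       "MES responded\n" : "MES failed to respond\n");
	pthread_join(t, NULL);
	return 0;
}
```

Using a per-submission writeback slot rather than the shared ring fence means a stale or wrapped sequence number can never be mistaken for completion of this packet.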
mes_v11_0_set_hw_resources_1(&adev->mes); +		if (r) { +			DRM_ERROR("failed mes_v11_0_set_hw_resources_1, r=%d\n", r); +			goto failure; +		} +	} +  	r = mes_v11_0_query_sched_status(&adev->mes);  	if (r) {  		DRM_ERROR("MES is busy\n"); @@ -1223,6 +1347,11 @@ failure:  static int mes_v11_0_hw_fini(void *handle)  { +	struct amdgpu_device *adev = (struct amdgpu_device *)handle; +	if (amdgpu_sriov_is_mes_info_enable(adev)) { +		amdgpu_bo_free_kernel(&adev->mes.resource_1, &adev->mes.resource_1_gpu_addr, +					&adev->mes.resource_1_addr); +	}  	return 0;  } @@ -1288,6 +1417,8 @@ static const struct amd_ip_funcs mes_v11_0_ip_funcs = {  	.hw_fini = mes_v11_0_hw_fini,  	.suspend = mes_v11_0_suspend,  	.resume = mes_v11_0_resume, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version mes_v11_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index c0fc44cdd658..7a1ff298417a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -559,6 +559,20 @@ static void mmhub_v1_8_get_clockgating(struct amdgpu_device *adev, u64 *flags)  } +static bool mmhub_v1_8_query_utcl2_poison_status(struct amdgpu_device *adev, +				int hub_inst) +{ +	u32 fed, status; + +	status = RREG32_SOC15(MMHUB, hub_inst, regVM_L2_PROTECTION_FAULT_STATUS); +	fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED); +	/* reset page fault status */ +	WREG32_P(SOC15_REG_OFFSET(MMHUB, hub_inst, +			regVM_L2_PROTECTION_FAULT_STATUS), 1, ~1); + +	return fed; +} +  const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {  	.get_fb_location = mmhub_v1_8_get_fb_location,  	.init = mmhub_v1_8_init, @@ -568,6 +582,7 @@ const struct amdgpu_mmhub_funcs mmhub_v1_8_funcs = {  	.setup_vm_pt_regs = mmhub_v1_8_setup_vm_pt_regs,  	.set_clockgating = mmhub_v1_8_set_clockgating,  	.get_clockgating = mmhub_v1_8_get_clockgating, +	.query_utcl2_poison_status = mmhub_v1_8_query_utcl2_poison_status,  };  static const struct amdgpu_ras_err_status_reg_entry mmhub_v1_8_ce_reg_list[] = { @@ -706,28 +721,32 @@ static const struct amdgpu_ras_block_hw_ops mmhub_v1_8_ras_hw_ops = {  	.reset_ras_error_count = mmhub_v1_8_reset_ras_error_count,  }; -static int mmhub_v1_8_aca_bank_generate_report(struct aca_handle *handle, -					       struct aca_bank *bank, enum aca_error_type type, -					       struct aca_bank_report *report, void *data) +static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, +				      enum aca_smu_type type, void *data)  { -	u64 status, misc0; +	struct aca_bank_info info; +	u64 misc0;  	int ret; -	status = bank->regs[ACA_REG_IDX_STATUS]; -	if ((type == ACA_ERROR_TYPE_UE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || -	    (type == ACA_ERROR_TYPE_CE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { - -		ret = aca_bank_info_decode(bank, &report->info); -		if (ret) -			return ret; - -		misc0 = bank->regs[ACA_REG_IDX_MISC0]; -		report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); +	ret = aca_bank_info_decode(bank, &info); +	if (ret) +		return ret; + +	misc0 = bank->regs[ACA_REG_IDX_MISC0]; +	switch (type) { +	case ACA_SMU_TYPE_UE: +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, +						     1ULL); +		break; +	case ACA_SMU_TYPE_CE: +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, +						     ACA_REG__MISC0__ERRCNT(misc0)); +		break; +	default: +		return -EINVAL;  	} -	
return 0; +	return ret;  }  /* reference to smu driver if header file */ @@ -741,7 +760,7 @@ static int mmhub_v1_8_err_codes[] = {  };  static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, -					 enum aca_error_type type, void *data) +					 enum aca_smu_type type, void *data)  {  	u32 instlo; @@ -760,7 +779,7 @@ static bool mmhub_v1_8_aca_bank_is_valid(struct aca_handle *handle, struct aca_b  }  static const struct aca_bank_ops mmhub_v1_8_aca_bank_ops = { -	.aca_bank_generate_report = mmhub_v1_8_aca_bank_generate_report, +	.aca_bank_parser = mmhub_v1_8_aca_bank_parser,  	.aca_bank_is_valid = mmhub_v1_8_aca_bank_is_valid,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index a2bd2c3b1ef9..0c7275bca8f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)  		timeout -= 10;  	} while (timeout > 1); +	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); +  flr_done:  	atomic_set(&adev->reset_domain->in_gpu_reset, 0);  	up_write(&adev->reset_domain->sem); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 77f5b55decf9..aba00d961627 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)  		timeout -= 10;  	} while (timeout > 1); +	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); +  flr_done:  	atomic_set(&adev->reset_domain->in_gpu_reset, 0);  	up_write(&adev->reset_domain->sem); @@ -444,7 +446,6 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,  		amdgpu_virt_fini_data_exchange(adev);  		xgpu_nv_send_access_requests_with_param(adev,  					IDH_RAS_POISON,	block, 0, 0); -		amdgpu_virt_init_data_exchange(adev);  	}  } diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c index 4178f4e5dad7..b281462093f1 100644 --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c @@ -713,6 +713,8 @@ static const struct amd_ip_funcs navi10_ih_ip_funcs = {  	.set_clockgating_state = navi10_ih_set_clockgating_state,  	.set_powergating_state = navi10_ih_set_powergating_state,  	.get_clockgating_state = navi10_ih_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs navi10_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 4d7976b77767..12e54047bf79 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -110,7 +110,7 @@ static const struct amdgpu_video_codec_info sc_video_codecs_decode_array_vcn0[]  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},  }; @@ -121,7 +121,7 @@ static const struct amdgpu_video_codec_info sc_video_codecs_decode_array_vcn1[]  	
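Both mailbox FLR handlers now warn when the IDH_FLR_NOTIFICATION_CMPL wait falls through: the loop burns a millisecond budget in 10 ms steps, so reaching the statement after the loop means every retry expired. A sketch of that countdown-with-fallthrough-warning shape, using a hypothetical 5 s budget:

```c
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool event_arrived(void) { return false; } /* stub: never completes */

int main(void)
{
	int timeout = 5000; /* ms, hypothetical budget */

	do {
		if (event_arrived())
			goto done;      /* completion short-circuits the wait */
		usleep(10 * 1000);      /* 10 ms per retry */
		timeout -= 10;
	} while (timeout > 1);

	/* only reached when every retry expired, like the new dev_warn */
	fprintf(stderr, "waiting for completion timed out\n");
done:
	return 0;
}
```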
{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  }; @@ -199,7 +199,7 @@ static const struct amdgpu_video_codec_info yc_video_codecs_decode_array[] = {  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},  }; @@ -1131,4 +1131,6 @@ static const struct amd_ip_funcs nv_common_ip_funcs = {  	.set_clockgating_state = nv_common_set_clockgating_state,  	.set_powergating_state = nv_common_set_powergating_state,  	.get_clockgating_state = nv_common_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c index 78a95f8f370b..f08a32c18694 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c @@ -169,7 +169,8 @@ static int psp_v14_0_bootloader_load_intf_drv(struct psp_context *psp)  static int psp_v14_0_bootloader_load_dbg_drv(struct psp_context *psp)  { -	return psp_v14_0_bootloader_load_component(psp, &psp->dbg_drv, PSP_BL__LOAD_DBGDRV); +	/* dbg_drv was renamed to had_drv in psp v14 */ +	return psp_v14_0_bootloader_load_component(psp, &psp->dbg_drv, PSP_BL__LOAD_HADDRV);  }  static int psp_v14_0_bootloader_load_ras_drv(struct psp_context *psp) @@ -177,6 +178,10 @@ static int psp_v14_0_bootloader_load_ras_drv(struct psp_context *psp)  	return psp_v14_0_bootloader_load_component(psp, &psp->ras_drv, PSP_BL__LOAD_RASDRV);  } +static int psp_v14_0_bootloader_load_ipkeymgr_drv(struct psp_context *psp) +{ +	return psp_v14_0_bootloader_load_component(psp, &psp->ipkeymgr_drv, PSP_BL__LOAD_IPKEYMGRDRV); +}  static int psp_v14_0_bootloader_load_sos(struct psp_context *psp)  { @@ -653,6 +658,7 @@ static const struct psp_funcs psp_v14_0_funcs = {  	.bootloader_load_intf_drv = psp_v14_0_bootloader_load_intf_drv,  	.bootloader_load_dbg_drv = psp_v14_0_bootloader_load_dbg_drv,  	.bootloader_load_ras_drv = psp_v14_0_bootloader_load_ras_drv, +	.bootloader_load_ipkeymgr_drv = psp_v14_0_bootloader_load_ipkeymgr_drv,  	.bootloader_load_sos = psp_v14_0_bootloader_load_sos,  	.ring_create = psp_v14_0_ring_create,  	.ring_stop = psp_v14_0_ring_stop, diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c index 07e19caf2bc1..ac8a9b9b3e52 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c @@ -1113,6 +1113,8 @@ static const struct amd_ip_funcs sdma_v2_4_ip_funcs = {  	.soft_reset = sdma_v2_4_soft_reset,  	.set_clockgating_state = sdma_v2_4_set_clockgating_state,  	.set_powergating_state = sdma_v2_4_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs sdma_v2_4_ring_funcs = { @@ -1176,7 +1178,7 @@ static void 
sdma_v2_4_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: unused + * @copy_flags: unused   *   * Copy GPU buffers using the DMA engine (VI).   * Used by the amdgpu ttm implementation to move pages if @@ -1186,7 +1188,7 @@ static void sdma_v2_4_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c index 2ad615be4bb3..b8ebdc4ae6f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c @@ -1553,6 +1553,8 @@ static const struct amd_ip_funcs sdma_v3_0_ip_funcs = {  	.set_clockgating_state = sdma_v3_0_set_clockgating_state,  	.set_powergating_state = sdma_v3_0_set_powergating_state,  	.get_clockgating_state = sdma_v3_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs sdma_v3_0_ring_funcs = { @@ -1616,7 +1618,7 @@ static void sdma_v3_0_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: unused + * @copy_flags: unused   *   * Copy GPU buffers using the DMA engine (VI).   * Used by the amdgpu ttm implementation to move pages if @@ -1626,7 +1628,7 @@ static void sdma_v3_0_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 43775cb67ff5..101038395c3b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -2021,6 +2021,9 @@ static int sdma_v4_0_process_trap_irq(struct amdgpu_device *adev,  	DRM_DEBUG("IH: SDMA trap\n");  	instance = sdma_v4_0_irq_id_to_seq(entry->client_id); +	if (instance < 0) +		return instance; +  	switch (entry->ring_id) {  	case 0:  		amdgpu_fence_process(&adev->sdma.instance[instance].ring); @@ -2448,7 +2451,7 @@ static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: if a secure copy should be used + * @copy_flags: copy flags for the buffers   *   * Copy GPU buffers using the DMA engine (VEGA10/12).   * Used by the amdgpu ttm implementation to move pages if @@ -2458,11 +2461,11 @@ static void sdma_v4_0_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | -		SDMA_PKT_COPY_LINEAR_HEADER_TMZ(tmz ? 1 : 0); +		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 
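Every emit_copy_buffer() callback in this series trades its bool tmz argument for a u32 copy_flags word, keeping TMZ as one bit and leaving room for more flags later. A sketch of the widened interface; the bit positions and the second flag are invented to show why a bitmask beats a bool:

```c
#include <stdint.h>
#include <stdio.h>

#define COPY_FLAGS_TMZ          (1u << 0)
#define COPY_FLAGS_DECOMPRESSED (1u << 1) /* invented extra flag */

/* Build the header word for a hypothetical copy packet; TMZ becomes a
 * single header bit, exactly the shape of the TMZ macro in the hunks. */
static uint32_t build_copy_header(uint32_t copy_flags)
{
	uint32_t hdr = 0x1; /* pretend opcode */

	if (copy_flags & COPY_FLAGS_TMZ)
		hdr |= 1u << 18; /* invented TMZ bit position */
	return hdr;
}

int main(void)
{
	printf("plain: 0x%08x\n", (unsigned)build_copy_header(0));
	printf("tmz:   0x%08x\n", (unsigned)build_copy_header(COPY_FLAGS_TMZ));
	return 0;
}
```

Callers that used to pass `false` now pass `0`, and new capabilities can be added without touching every backend's signature again.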
1 : 0);  	ib->ptr[ib->length_dw++] = byte_count - 1;  	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */  	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 34237a1b1f2e..341b24d8320b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -368,7 +368,8 @@ static void sdma_v4_4_2_ring_emit_hdp_flush(struct amdgpu_ring *ring)  	u32 ref_and_mask = 0;  	const struct nbio_hdp_flush_reg *nbio_hf_reg = adev->nbio.hdp_flush_reg; -	ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; +	ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 +		       << (ring->me % adev->sdma.num_inst_per_aid);  	sdma_v4_4_2_wait_reg_mem(ring, 0, 1,  			       adev->nbio.funcs->get_hdp_flush_done_offset(adev), @@ -1602,19 +1603,9 @@ static int sdma_v4_4_2_set_ecc_irq_state(struct amdgpu_device *adev,  	u32 sdma_cntl;  	sdma_cntl = RREG32_SDMA(type, regSDMA_CNTL); -	switch (state) { -	case AMDGPU_IRQ_STATE_DISABLE: -		sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, -					  DRAM_ECC_INT_ENABLE, 0); -		WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl); -		break; -	/* sdma ecc interrupt is enabled by default -	 * driver doesn't need to do anything to -	 * enable the interrupt */ -	case AMDGPU_IRQ_STATE_ENABLE: -	default: -		break; -	} +	sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, DRAM_ECC_INT_ENABLE, +					state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); +	WREG32_SDMA(type, regSDMA_CNTL, sdma_cntl);  	return 0;  } @@ -1954,7 +1945,7 @@ static void sdma_v4_4_2_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: if a secure copy should be used + * @copy_flags: copy flags for the buffers   *   * Copy GPU buffers using the DMA engine.   * Used by the amdgpu ttm implementation to move pages if @@ -1964,11 +1955,11 @@ static void sdma_v4_4_2_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | -		SDMA_PKT_COPY_LINEAR_HEADER_TMZ(tmz ? 1 : 0); +		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 
1 : 0);  	ib->ptr[ib->length_dw++] = byte_count - 1;  	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */  	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset); @@ -2189,35 +2180,39 @@ static const struct amdgpu_ras_block_hw_ops sdma_v4_4_2_ras_hw_ops = {  	.reset_ras_error_count = sdma_v4_4_2_reset_ras_error_count,  }; -static int sdma_v4_4_2_aca_bank_generate_report(struct aca_handle *handle, -						struct aca_bank *bank, enum aca_error_type type, -						struct aca_bank_report *report, void *data) +static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, +				       enum aca_smu_type type, void *data)  { -	u64 status, misc0; +	struct aca_bank_info info; +	u64 misc0;  	int ret; -	status = bank->regs[ACA_REG_IDX_STATUS]; -	if ((type == ACA_ERROR_TYPE_UE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_FAULT) || -	    (type == ACA_ERROR_TYPE_CE && -	     ACA_REG__STATUS__ERRORCODEEXT(status) == ACA_EXTERROR_CODE_CE)) { - -		ret = aca_bank_info_decode(bank, &report->info); -		if (ret) -			return ret; +	ret = aca_bank_info_decode(bank, &info); +	if (ret) +		return ret; -		misc0 = bank->regs[ACA_REG_IDX_MISC0]; -		report->count[type] = ACA_REG__MISC0__ERRCNT(misc0); +	misc0 = bank->regs[ACA_REG_IDX_MISC0]; +	switch (type) { +	case ACA_SMU_TYPE_UE: +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_UE, +						     1ULL); +		break; +	case ACA_SMU_TYPE_CE: +		ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, +						     ACA_REG__MISC0__ERRCNT(misc0)); +		break; +	default: +		return -EINVAL;  	} -	return 0; +	return ret;  }  /* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */  static int sdma_v4_4_2_err_codes[] = { 33, 34, 35, 36 };  static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_bank *bank, -					  enum aca_error_type type, void *data) +					  enum aca_smu_type type, void *data)  {  	u32 instlo; @@ -2236,7 +2231,7 @@ static bool sdma_v4_4_2_aca_bank_is_valid(struct aca_handle *handle, struct aca_  }  static const struct aca_bank_ops sdma_v4_4_2_aca_bank_ops = { -	.aca_bank_generate_report = sdma_v4_4_2_aca_bank_generate_report, +	.aca_bank_parser = sdma_v4_4_2_aca_bank_parser,  	.aca_bank_is_valid = sdma_v4_4_2_aca_bank_is_valid,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 883e8a1b8a40..b7d33d78bce0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -999,7 +999,8 @@ static int sdma_v5_0_ring_test_ring(struct amdgpu_ring *ring)  	r = amdgpu_ring_alloc(ring, 20);  	if (r) {  		DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); -		amdgpu_device_wb_free(adev, index); +		if (!ring->is_mes_queue) +			amdgpu_device_wb_free(adev, index);  		return r;  	} @@ -1805,7 +1806,7 @@ static void sdma_v5_0_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: if a secure copy should be used + * @copy_flags: copy flags for the buffers   *   * Copy GPU buffers using the DMA engine (NAVI10).   
* Used by the amdgpu ttm implementation to move pages if @@ -1815,11 +1816,11 @@ static void sdma_v5_0_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | -		SDMA_PKT_COPY_LINEAR_HEADER_TMZ(tmz ? 1 : 0); +		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0);  	ib->ptr[ib->length_dw++] = byte_count - 1;  	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */  	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index 42f4bd250def..cc9e961f0078 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -280,17 +280,21 @@ static void sdma_v5_2_ring_emit_hdp_flush(struct amdgpu_ring *ring)  	u32 ref_and_mask = 0;  	const struct nbio_hdp_flush_reg *nbio_hf_reg = adev->nbio.hdp_flush_reg; -	ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; - -	amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_POLL_REGMEM) | -			  SDMA_PKT_POLL_REGMEM_HEADER_HDP_FLUSH(1) | -			  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* == */ -	amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_done_offset(adev)) << 2); -	amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_req_offset(adev)) << 2); -	amdgpu_ring_write(ring, ref_and_mask); /* reference */ -	amdgpu_ring_write(ring, ref_and_mask); /* mask */ -	amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | -			  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10)); /* retry count, poll interval */ +	if (ring->me > 1) { +		amdgpu_asic_flush_hdp(adev, ring); +	} else { +		ref_and_mask = nbio_hf_reg->ref_and_mask_sdma0 << ring->me; + +		amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_POLL_REGMEM) | +				  SDMA_PKT_POLL_REGMEM_HEADER_HDP_FLUSH(1) | +				  SDMA_PKT_POLL_REGMEM_HEADER_FUNC(3)); /* == */ +		amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_done_offset(adev)) << 2); +		amdgpu_ring_write(ring, (adev->nbio.funcs->get_hdp_flush_req_offset(adev)) << 2); +		amdgpu_ring_write(ring, ref_and_mask); /* reference */ +		amdgpu_ring_write(ring, ref_and_mask); /* mask */ +		amdgpu_ring_write(ring, SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | +				  SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(10)); /* retry count, poll interval */ +	}  }  /** @@ -835,7 +839,8 @@ static int sdma_v5_2_ring_test_ring(struct amdgpu_ring *ring)  	r = amdgpu_ring_alloc(ring, 20);  	if (r) {  		DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); -		amdgpu_device_wb_free(adev, index); +		if (!ring->is_mes_queue) +			amdgpu_device_wb_free(adev, index);  		return r;  	} @@ -1747,7 +1752,7 @@ static void sdma_v5_2_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: if a secure copy should be used + * @copy_flags: copy flags for the buffers   *   * Copy GPU buffers using the DMA engine.   
* Used by the amdgpu ttm implementation to move pages if @@ -1757,11 +1762,11 @@ static void sdma_v5_2_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | -		SDMA_PKT_COPY_LINEAR_HEADER_TMZ(tmz ? 1 : 0); +		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 1 : 0);  	ib->ptr[ib->length_dw++] = byte_count - 1;  	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */  	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset); diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index 361835a61f2e..c833b6b8373b 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -507,6 +507,13 @@ static int sdma_v6_0_gfx_resume(struct amdgpu_device *adev)  		/* set minor_ptr_update to 0 after wptr programed */  		WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_QUEUE0_MINOR_PTR_UPDATE), 0); +		/* Set up sdma hang watchdog */ +		temp = RREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_WATCHDOG_CNTL)); +		/* 100ms per unit */ +		temp = REG_SET_FIELD(temp, SDMA0_WATCHDOG_CNTL, QUEUE_HANG_COUNT, +				     max(adev->usec_timeout/100000, 1)); +		WREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_WATCHDOG_CNTL), temp); +  		/* Set up RESP_MODE to non-copy addresses */  		temp = RREG32_SOC15_IP(GC, sdma_v6_0_get_reg_offset(adev, i, regSDMA0_UTCL1_CNTL));  		temp = REG_SET_FIELD(temp, SDMA0_UTCL1_CNTL, RESP_MODE, 3); @@ -854,7 +861,8 @@ static int sdma_v6_0_ring_test_ring(struct amdgpu_ring *ring)  	r = amdgpu_ring_alloc(ring, 5);  	if (r) {  		DRM_ERROR("amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, r); -		amdgpu_device_wb_free(adev, index); +		if (!ring->is_mes_queue) +			amdgpu_device_wb_free(adev, index);  		return r;  	} @@ -1567,7 +1575,7 @@ static void sdma_v6_0_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: if a secure copy should be used + * @copy_flags: copy flags for the buffers   *   * Copy GPU buffers using the DMA engine.   * Used by the amdgpu ttm implementation to move pages if @@ -1577,11 +1585,11 @@ static void sdma_v6_0_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = SDMA_PKT_COPY_LINEAR_HEADER_OP(SDMA_OP_COPY) |  		SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(SDMA_SUBOP_COPY_LINEAR) | -		SDMA_PKT_COPY_LINEAR_HEADER_TMZ(tmz ? 1 : 0); +		SDMA_PKT_COPY_LINEAR_HEADER_TMZ((copy_flags & AMDGPU_COPY_FLAGS_TMZ) ? 
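The new sdma_v6_0 watchdog setup converts the driver's microsecond timeout into the register's 100 ms units and clamps the result to at least one unit, so a short usec_timeout still arms the watchdog. The conversion in isolation:

```c
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* QUEUE_HANG_COUNT is programmed in 100 ms units; usec_timeout is in
 * microseconds, so divide by 100000 and never go below one unit. */
static unsigned int hang_count_units(unsigned int usec_timeout)
{
	return MAX(usec_timeout / 100000, 1u);
}

int main(void)
{
	printf("10s  -> %u units\n", hang_count_units(10000000)); /* 100 */
	printf("50ms -> %u units\n", hang_count_units(50000));    /* clamps to 1 */
	return 0;
}
```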
1 : 0);  	ib->ptr[ib->length_dw++] = byte_count - 1;  	ib->ptr[ib->length_dw++] = 0; /* src/dst endian swap */  	ib->ptr[ib->length_dw++] = lower_32_bits(src_offset); diff --git a/drivers/gpu/drm/amd/amdgpu/si.c b/drivers/gpu/drm/amd/amdgpu/si.c index 23e4ef4fff7c..85235470e872 100644 --- a/drivers/gpu/drm/amd/amdgpu/si.c +++ b/drivers/gpu/drm/amd/amdgpu/si.c @@ -1409,9 +1409,9 @@ static int si_gpu_pci_config_reset(struct amdgpu_device *adev)  	return r;  } -static bool si_asic_supports_baco(struct amdgpu_device *adev) +static int si_asic_supports_baco(struct amdgpu_device *adev)  { -	return false; +	return 0;  }  static enum amd_reset_method @@ -2706,6 +2706,8 @@ static const struct amd_ip_funcs si_common_ip_funcs = {  	.soft_reset = si_common_soft_reset,  	.set_clockgating_state = si_common_set_clockgating_state,  	.set_powergating_state = si_common_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ip_block_version si_common_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/si_dma.c b/drivers/gpu/drm/amd/amdgpu/si_dma.c index 9aa0e11ee673..11db5b755832 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_dma.c +++ b/drivers/gpu/drm/amd/amdgpu/si_dma.c @@ -708,6 +708,8 @@ static const struct amd_ip_funcs si_dma_ip_funcs = {  	.soft_reset = si_dma_soft_reset,  	.set_clockgating_state = si_dma_set_clockgating_state,  	.set_powergating_state = si_dma_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs si_dma_ring_funcs = { @@ -761,7 +763,7 @@ static void si_dma_set_irq_funcs(struct amdgpu_device *adev)   * @src_offset: src GPU address   * @dst_offset: dst GPU address   * @byte_count: number of bytes to xfer - * @tmz: is this a secure operation + * @copy_flags: unused   *   * Copy GPU buffers using the DMA engine (VI).   * Used by the amdgpu ttm implementation to move pages if @@ -771,7 +773,7 @@ static void si_dma_emit_copy_buffer(struct amdgpu_ib *ib,  				       uint64_t src_offset,  				       uint64_t dst_offset,  				       uint32_t byte_count, -				       bool tmz) +				       uint32_t copy_flags)  {  	ib->ptr[ib->length_dw++] = DMA_PACKET(DMA_PACKET_COPY,  					      1, 0, 0, byte_count); diff --git a/drivers/gpu/drm/amd/amdgpu/si_ih.c b/drivers/gpu/drm/amd/amdgpu/si_ih.c index cada9f300a7f..5237395e4fab 100644 --- a/drivers/gpu/drm/amd/amdgpu/si_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/si_ih.c @@ -296,6 +296,8 @@ static const struct amd_ip_funcs si_ih_ip_funcs = {  	.soft_reset = si_ih_soft_reset,  	.set_clockgating_state = si_ih_set_clockgating_state,  	.set_powergating_state = si_ih_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs si_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c index 93f6772d1b24..481217c32d85 100644 --- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c +++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c @@ -92,7 +92,7 @@ static int sienna_cichlid_mode2_suspend_ip(struct amdgpu_device *adev)  		adev->ip_blocks[i].status.hw = false;  	} -	return r; +	return 0;  }  static int diff --git a/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.c b/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.c new file mode 100644 index 000000000000..2a51a70d4846 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.c @@ -0,0 +1,62 @@ +/* + * Copyright 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu.h" +#include "smuio_v14_0_2.h" +#include "smuio/smuio_14_0_2_offset.h" +#include "smuio/smuio_14_0_2_sh_mask.h" +#include <linux/preempt.h> + +static u32 smuio_v14_0_2_get_rom_index_offset(struct amdgpu_device *adev) +{ +	return SOC15_REG_OFFSET(SMUIO, 0, regROM_INDEX); +} + +static u32 smuio_v14_0_2_get_rom_data_offset(struct amdgpu_device *adev) +{ +	return SOC15_REG_OFFSET(SMUIO, 0, regROM_DATA); +} + +static u64 smuio_v14_0_2_get_gpu_clock_counter(struct amdgpu_device *adev) +{ +	u64 clock; +	u64 clock_counter_lo, clock_counter_hi_pre, clock_counter_hi_after; + +	preempt_disable(); +	clock_counter_hi_pre = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_UPPER); +	clock_counter_lo = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_LOWER); +	/* the clock counter may be updated while polling the counters */ +	clock_counter_hi_after = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_UPPER); +	if (clock_counter_hi_pre != clock_counter_hi_after) +		clock_counter_lo = (u64)RREG32_SOC15(SMUIO, 0, regGOLDEN_TSC_COUNT_LOWER); +	preempt_enable(); + +	clock = clock_counter_lo | (clock_counter_hi_after << 32ULL); + +	return clock; +} + +const struct amdgpu_smuio_funcs smuio_v14_0_2_funcs = { +	.get_rom_index_offset = smuio_v14_0_2_get_rom_index_offset, +	.get_rom_data_offset = smuio_v14_0_2_get_rom_data_offset, +	.get_gpu_clock_counter = smuio_v14_0_2_get_gpu_clock_counter, +}; diff --git a/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.h b/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.h new file mode 100644 index 000000000000..6e617f832d90 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/smuio_v14_0_2.h @@ -0,0 +1,30 @@ +/* + * Copyright 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
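smuio_v14_0_2_get_gpu_clock_counter() above guards against a torn read of the 64-bit GOLDEN_TSC counter: it samples the upper half, then the lower, then the upper again, and re-reads the lower half if the high word rolled over in between. The same idiom against a simulated split counter:

```c
#include <stdint.h>
#include <stdio.h>

/* Simulated 64-bit counter exposed as two 32-bit halves; it keeps
 * ticking between reads so a rollover can land mid-sample. */
static uint64_t sim_counter = 0x00000001fffffffeULL;

static uint32_t read_lower(void)
{
	sim_counter += 3; /* counter advances while being polled */
	return (uint32_t)sim_counter;
}

static uint32_t read_upper(void) { return (uint32_t)(sim_counter >> 32); }

static uint64_t read_counter64(void)
{
	uint64_t hi_pre, lo, hi_after;

	hi_pre = read_upper();
	lo = read_lower();
	hi_after = read_upper();
	if (hi_pre != hi_after)  /* high word rolled over: lo is stale */
		lo = read_lower();
	return (hi_after << 32) | lo;
}

int main(void)
{
	printf("counter = 0x%016llx\n",
	       (unsigned long long)read_counter64());
	return 0;
}
```

The kernel version additionally disables preemption around the three reads so the sample window stays as short as possible.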
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __SMUIO_V14_0_2_H__ +#define __SMUIO_V14_0_2_H__ + +#include "soc15_common.h" + +extern const struct amdgpu_smuio_funcs smuio_v14_0_2_funcs; + +#endif /* __SMUIO_V14_0_2_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index dec81ccf6240..170f02e96717 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -143,7 +143,7 @@ static const struct amdgpu_video_codec_info rn_video_codecs_decode_array[] =  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VC1, 4096, 4096, 4)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  }; @@ -156,7 +156,7 @@ static const struct amdgpu_video_codecs rn_video_codecs_decode =  static const struct amdgpu_video_codec_info vcn_4_0_3_video_codecs_decode_array[] = {  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},  }; @@ -502,7 +502,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev)  static enum amd_reset_method  soc15_asic_reset_method(struct amdgpu_device *adev)  { -	bool baco_reset = false; +	int baco_reset = 0;  	bool connected_to_cpu = false;  	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); @@ -540,7 +540,7 @@ soc15_asic_reset_method(struct amdgpu_device *adev)  			 */  			if (ras && adev->ras_enabled &&  			    adev->pm.fw_version <= 0x283400) -				baco_reset = false; +				baco_reset = 0;  		} else {  			baco_reset = amdgpu_dpm_is_baco_supported(adev);  		} @@ -620,7 +620,7 @@ static int soc15_asic_reset(struct amdgpu_device *adev)  	}  } -static bool soc15_supports_baco(struct amdgpu_device *adev) +static int soc15_supports_baco(struct amdgpu_device *adev)  {  	switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {  	case IP_VERSION(9, 0, 0): @@ -628,13 +628,13 @@ static bool soc15_supports_baco(struct amdgpu_device *adev)  		if (adev->asic_type == CHIP_VEGA20) {  			if (adev->psp.sos.fw_version >= 0x80067)  				return amdgpu_dpm_is_baco_supported(adev); -			return false; +			return 0;  		} else {  			return amdgpu_dpm_is_baco_supported(adev);  		}  		break;  	default: -		return false; +		return 0;  	}  } @@ -1501,4 +1501,6 @@ static const struct amd_ip_funcs soc15_common_ip_funcs = {  	.set_clockgating_state = soc15_common_set_clockgating_state,  	.set_powergating_state = soc15_common_set_powergating_state,  	
.get_clockgating_state= soc15_common_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.h b/drivers/gpu/drm/amd/amdgpu/soc15.h index 1444b7765e4b..282584a48be0 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15.h @@ -88,6 +88,8 @@ struct soc15_ras_field_entry {  };  #define SOC15_REG_ENTRY(ip, inst, reg)	ip##_HWIP, inst, reg##_BASE_IDX, reg +#define SOC15_REG_ENTRY_STR(ip, inst, reg) \ +	{ ip##_HWIP, inst, reg##_BASE_IDX, reg, #reg }  #define SOC15_REG_ENTRY_OFFSET(entry)	(adev->reg_offset[entry.hwip][entry.inst][entry.seg] + entry.reg_offset) diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 581a3bd11481..fb6797467571 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -72,7 +72,7 @@ static const struct amdgpu_video_codecs vcn_4_0_0_video_codecs_encode_vcn1 = {  static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_decode_array_vcn0[] = {  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_AV1, 8192, 4352, 0)},  }; @@ -80,7 +80,7 @@ static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_decode_array_  static const struct amdgpu_video_codec_info vcn_4_0_0_video_codecs_decode_array_vcn1[] = {  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_MPEG4_AVC, 4096, 4096, 52)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_HEVC, 8192, 4352, 186)}, -	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 4096, 4096, 0)}, +	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, 16384, 16384, 0)},  	{codec_info_build(AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_VP9, 8192, 4352, 0)},  }; @@ -457,10 +457,8 @@ static bool soc21_need_full_reset(struct amdgpu_device *adev)  {  	switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {  	case IP_VERSION(11, 0, 0): -		return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC);  	case IP_VERSION(11, 0, 2):  	case IP_VERSION(11, 0, 3): -		return false;  	default:  		return true;  	} @@ -722,7 +720,10 @@ static int soc21_common_early_init(void *handle)  			AMD_PG_SUPPORT_VCN |  			AMD_PG_SUPPORT_JPEG |  			AMD_PG_SUPPORT_GFX_PG; -		adev->external_rev_id = adev->rev_id + 0x1; +		if (adev->rev_id == 0) +			adev->external_rev_id = 0x1; +		else +			adev->external_rev_id = adev->rev_id + 0x10;  		break;  	case IP_VERSION(11, 5, 1):  		adev->cg_flags = @@ -869,10 +870,35 @@ static int soc21_common_suspend(void *handle)  	return soc21_common_hw_fini(adev);  } +static bool soc21_need_reset_on_resume(struct amdgpu_device *adev) +{ +	u32 sol_reg1, sol_reg2; + +	/* Will reset for the following suspend abort cases. +	 * 1) Only reset dGPU side. +	 * 2) S3 suspend got aborted and TOS is active. 
+	 */ +	if (!(adev->flags & AMD_IS_APU) && adev->in_s3 && +	    !adev->suspend_complete) { +		sol_reg1 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); +		msleep(100); +		sol_reg2 = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_81); + +		return (sol_reg1 != sol_reg2); +	} + +	return false; +} +  static int soc21_common_resume(void *handle)  {  	struct amdgpu_device *adev = (struct amdgpu_device *)handle; +	if (soc21_need_reset_on_resume(adev)) { +		dev_info(adev->dev, "S3 suspend aborted, resetting..."); +		soc21_asic_reset(adev); +	} +  	return soc21_common_hw_init(adev);  } @@ -959,4 +985,6 @@ static const struct amd_ip_funcs soc21_common_ip_funcs = {  	.set_clockgating_state = soc21_common_set_clockgating_state,  	.set_powergating_state = soc21_common_set_powergating_state,  	.get_clockgating_state = soc21_common_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h index 056d4df8fa1f..3ac56a9645eb 100644 --- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h +++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h @@ -146,6 +146,7 @@ struct ta_ras_mca_addr {  	uint32_t ch_inst;  	uint32_t umc_inst;  	uint32_t node_inst; +	uint32_t socket_id;  };  struct ta_ras_phy_addr { diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c index 450b6e831509..24d49d813607 100644 --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c @@ -486,6 +486,8 @@ static const struct amd_ip_funcs tonga_ih_ip_funcs = {  	.post_soft_reset = tonga_ih_post_soft_reset,  	.set_clockgating_state = tonga_ih_set_clockgating_state,  	.set_powergating_state = tonga_ih_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ih_funcs tonga_ih_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 77af4e25ff46..bfe61d86ee6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -28,27 +28,7 @@  #include "umc/umc_12_0_0_sh_mask.h"  #include "mp/mp_13_0_6_sh_mask.h" -const uint32_t -	umc_v12_0_channel_idx_tbl[] -			[UMC_V12_0_UMC_INSTANCE_NUM] -			[UMC_V12_0_CHANNEL_INSTANCE_NUM] = { -		{{3,   7,   11,  15,  2,   6,   10,  14},  {1,   5,   9,   13,  0,   4,   8,   12}, -		 {19,  23,  27,  31,  18,  22,  26,  30},  {17,  21,  25,  29,  16,  20,  24,  28}}, -		{{47,  43,  39,  35,  46,  42,  38,  34},  {45,  41,  37,  33,  44,  40,  36,  32}, -		 {63,  59,  55,  51,  62,  58,  54,  50},  {61,  57,  53,  49,  60,  56,  52,  48}}, -		{{79,  75,  71,  67,  78,  74,  70,  66},  {77,  73,  69,  65,  76,  72,  68,  64}, -		 {95,  91,  87,  83,  94,  90,  86,  82},  {93,  89,  85,  81,  92,  88,  84,  80}}, -		{{99,  103, 107, 111, 98,  102, 106, 110}, {97,  101, 105, 109, 96,  100, 104, 108}, -		 {115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}} -	}; - -/* mapping of MCA error address to normalized address */ -static const uint32_t umc_v12_0_ma2na_mapping[] = { -	0,  5,  6,  8,  9,  14, 12, 13, -	10, 11, 15, 16, 17, 18, 19, 20, -	21, 22, 23, 24, 25, 26, 27, 28, -	24, 7,  29, 30, -}; +#define MAX_ECC_NUM_PER_RETIREMENT  32  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,  					    uint32_t node_inst, @@ -192,99 +172,74 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,  	umc_v12_0_reset_error_count(adev);  } -static bool umc_v12_0_bit_wise_xor(uint32_t 
val) +static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, +					struct ras_err_data *err_data, +					struct ta_ras_query_address_input *addr_in)  { -	bool result = 0; -	int i; +	uint32_t col, row, row_xor, bank, channel_index; +	uint64_t soc_pa, retired_page, column, err_addr; +	struct ta_ras_query_address_output addr_out; -	for (i = 0; i < 32; i++) -		result = result ^ ((val >> i) & 0x1); +	err_addr = addr_in->ma.err_addr; +	addr_in->addr_type = TA_RAS_MCA_TO_PA; +	if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) { +		dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", +			err_addr); -	return result; -} +		return; +	} + +	soc_pa = addr_out.pa.pa; +	bank = addr_out.pa.bank; +	channel_index = addr_out.pa.channel_idx; -static void umc_v12_0_mca_addr_to_pa(struct amdgpu_device *adev, -					uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst, -					uint32_t node_inst, -					struct ta_ras_query_address_output *addr_out) -{ -	uint32_t channel_index, i; -	uint64_t na, soc_pa; -	uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; -	uint32_t bank0, bank1, bank2, bank3, bank; - -	bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; -	bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL; -	bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL; -	bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;  	col = (err_addr >> 1) & 0x1fULL;  	row = (err_addr >> 10) & 0x3fffULL; +	row_xor = row ^ (0x1ULL << 13); +	/* clear [C3 C2] in soc physical address */ +	soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); +	/* clear [C4] in soc physical address */ +	soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); + +	/* loop for all possibilities of [C4 C3 C2] */ +	for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { +		retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT); +		retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); +		/* include column bit 0 and 1 */ +		col &= 0x3; +		col |= (column << 2); +		dev_info(adev->dev, +			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", +			retired_page, row, col, bank, channel_index); +		amdgpu_umc_fill_error_record(err_data, err_addr, +			retired_page, channel_index, addr_in->ma.umc_inst); -	/* apply bank hash algorithm */ -	bank0 = -		bank_hash0 ^ (UMC_V12_0_XOR_EN0 & -		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^ -		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0)))); -	bank1 = -		bank_hash1 ^ (UMC_V12_0_XOR_EN1 & -		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^ -		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1)))); -	bank2 = -		bank_hash2 ^ (UMC_V12_0_XOR_EN2 & -		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^ -		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2)))); -	bank3 = -		bank_hash3 ^ (UMC_V12_0_XOR_EN3 & -		(umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^ -		(umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3)))); - -	bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3); -	err_addr &= ~0x3c0ULL; -	err_addr |= (bank << UMC_V12_0_MCA_B0_BIT); - -	na = 0x0; -	/* convert mca error address to normalized address */ -	for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++) -		na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i]; - -	channel_index = -		adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * -			adev->umc.channel_inst_num + -			umc_inst * adev->umc.channel_inst_num + -			ch_inst]; -	/* translate umc channel address to soc pa, 3 parts are included */ -	soc_pa = ADDR_OF_32KB_BLOCK(na) | 
-		ADDR_OF_256B_BLOCK(channel_index) | -		OFFSET_IN_256B_BLOCK(na); - -	/* the umc channel bits are not original values, they are hashed */ -	UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa); - -	addr_out->pa.pa = soc_pa; -	addr_out->pa.bank = bank; -	addr_out->pa.channel_idx = channel_index; +		/* shift R13 bit */ +		retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); +		dev_info(adev->dev, +			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", +			retired_page, row_xor, col, bank, channel_index); +		amdgpu_umc_fill_error_record(err_data, err_addr, +			retired_page, channel_index, addr_in->ma.umc_inst); +	}  } -static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, -					    struct ras_err_data *err_data, uint64_t err_addr, -					    uint32_t ch_inst, uint32_t umc_inst, -					    uint32_t node_inst) +static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev, +				struct ta_ras_query_address_input *addr_in, +				uint64_t *pfns, int len)  {  	uint32_t col, row, row_xor, bank, channel_index; -	uint64_t soc_pa, retired_page, column; -	struct ta_ras_query_address_input addr_in; +	uint64_t soc_pa, retired_page, column, err_addr;  	struct ta_ras_query_address_output addr_out; +	uint32_t pos = 0; -	addr_in.addr_type = TA_RAS_MCA_TO_PA; -	addr_in.ma.err_addr = err_addr; -	addr_in.ma.ch_inst = ch_inst; -	addr_in.ma.umc_inst = umc_inst; -	addr_in.ma.node_inst = node_inst; - -	if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) -		/* fallback to old path if fail to get pa from psp */ -		umc_v12_0_mca_addr_to_pa(adev, err_addr, ch_inst, umc_inst, -				node_inst, &addr_out); +	err_addr = addr_in->ma.err_addr; +	addr_in->addr_type = TA_RAS_MCA_TO_PA; +	if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) { +		dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx", +			err_addr); +		return 0; +	}  	soc_pa = addr_out.pa.pa;  	bank = addr_out.pa.bank; @@ -302,33 +257,42 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,  	for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {  		retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);  		retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT); + +		if (pos >= len) +			return 0; +		pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; +  		/* include column bit 0 and 1 */  		col &= 0x3;  		col |= (column << 2);  		dev_info(adev->dev,  			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",  			retired_page, row, col, bank, channel_index); -		amdgpu_umc_fill_error_record(err_data, err_addr, -			retired_page, channel_index, umc_inst);  		/* shift R13 bit */  		retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); + +		if (pos >= len) +			return 0; +		pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; +  		dev_info(adev->dev,  			"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",  			retired_page, row_xor, col, bank, channel_index); -		amdgpu_umc_fill_error_record(err_data, err_addr, -			retired_page, channel_index, umc_inst);  	} + +	return pos;  }  static int umc_v12_0_query_error_address(struct amdgpu_device *adev,  					uint32_t node_inst, uint32_t umc_inst,  					uint32_t ch_inst, void *data)  { +	struct ras_err_data *err_data = (struct ras_err_data *)data; +	struct ta_ras_query_address_input addr_in;  	uint64_t mc_umc_status_addr;  	uint64_t mc_umc_status, err_addr;  	uint64_t mc_umc_addrt0; -	struct ras_err_data *err_data = (struct ras_err_data *)data;  	uint64_t umc_reg_offset =  	
	get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); @@ -357,8 +321,19 @@ static int umc_v12_0_query_error_address(struct amdgpu_device *adev,  		err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); -		umc_v12_0_convert_error_address(adev, err_data, err_addr, -					ch_inst, umc_inst, node_inst); +		if (!adev->aid_mask && +		    adev->smuio.funcs && +		    adev->smuio.funcs->get_socket_id) +			addr_in.ma.socket_id = adev->smuio.funcs->get_socket_id(adev); +		else +			addr_in.ma.socket_id = 0; + +		addr_in.ma.err_addr = err_addr; +		addr_in.ma.ch_inst = ch_inst; +		addr_in.ma.umc_inst = umc_inst; +		addr_in.ma.node_inst = node_inst; + +		umc_v12_0_convert_error_address(adev, err_data, &addr_in);  	}  	/* clear umc status */ @@ -401,13 +376,20 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,  	return 0;  } +#ifdef TO_BE_REMOVED  static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,  					void *ras_error_status)  { +	struct ras_query_context qctx; + +	memset(&qctx, 0, sizeof(qctx)); +	qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? +						    RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); +  	amdgpu_mca_smu_log_ras_error(adev, -		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status); +		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);  	amdgpu_mca_smu_log_ras_error(adev, -		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status); +		AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);  }  static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev, @@ -418,12 +400,16 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade  	struct ras_err_info *err_info;  	struct ras_err_addr *mca_err_addr, *tmp;  	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; +	struct ta_ras_query_address_input addr_in;  	for_each_ras_error(err_node, err_data) {  		err_info = &err_node->err_info;  		if (list_empty(&err_info->err_addr_list))  			continue; +		addr_in.ma.node_inst = err_info->mcm_info.die_id; +		addr_in.ma.socket_id = err_info->mcm_info.socket_id; +  		list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {  			mc_umc_status = mca_err_addr->err_status;  			if (mc_umc_status && @@ -439,6 +425,10 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade  							MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);  				InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo); +				addr_in.ma.err_addr = err_addr; +				addr_in.ma.ch_inst = MCA_IPID_LO_2_UMC_CH(InstanceIdLo); +				addr_in.ma.umc_inst = MCA_IPID_LO_2_UMC_INST(InstanceIdLo); +  				dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",  					mca_ipid,  					err_info->mcm_info.die_id, @@ -447,10 +437,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade  					err_addr);  				umc_v12_0_convert_error_address(adev, -					err_data, err_addr, -					MCA_IPID_LO_2_UMC_CH(InstanceIdLo), -					MCA_IPID_LO_2_UMC_INST(InstanceIdLo), -					err_info->mcm_info.die_id); +					err_data, &addr_in);  			}  			/* Delete error address node from list and free memory */ @@ -458,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade  		}  	}  } +#endif  static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,  			enum amdgpu_mca_error_type type, void 
*ras_error_status) @@ -498,43 +486,49 @@ const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = {  	.query_ras_error_address = umc_v12_0_query_ras_error_address,  }; -static int umc_v12_0_aca_bank_generate_report(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, -					      struct aca_bank_report *report, void *data) +static int umc_v12_0_aca_bank_parser(struct aca_handle *handle, struct aca_bank *bank, +				     enum aca_smu_type type, void *data)  {  	struct amdgpu_device *adev = handle->adev; -	u64 status; +	struct aca_bank_info info; +	enum aca_error_type err_type; +	u64 status, count; +	u32 ext_error_code;  	int ret; -	ret = aca_bank_info_decode(bank, &report->info); +	status = bank->regs[ACA_REG_IDX_STATUS]; +	if (umc_v12_0_is_deferred_error(adev, status)) +		err_type = ACA_ERROR_TYPE_DEFERRED; +	else if (umc_v12_0_is_uncorrectable_error(adev, status)) +		err_type = ACA_ERROR_TYPE_UE; +	else if (umc_v12_0_is_correctable_error(adev, status)) +		err_type = ACA_ERROR_TYPE_CE; +	else +		return 0; + +	ret = aca_bank_info_decode(bank, &info);  	if (ret)  		return ret; -	status = bank->regs[ACA_REG_IDX_STATUS]; -	switch (type) { -	case ACA_ERROR_TYPE_UE: -		if (umc_v12_0_is_uncorrectable_error(adev, status)) { -			report->count[type] = 1; -		} -		break; -	case ACA_ERROR_TYPE_CE: -		if (umc_v12_0_is_correctable_error(adev, status)) { -			report->count[type] = 1; -		} -		break; -	default: -		return -EINVAL; -	} +	amdgpu_umc_update_ecc_status(adev, +		bank->regs[ACA_REG_IDX_STATUS], +		bank->regs[ACA_REG_IDX_IPID], +		bank->regs[ACA_REG_IDX_ADDR]); -	return 0; +	ext_error_code = ACA_REG__STATUS__ERRORCODEEXT(status); +	count = ext_error_code == 0 ? +		ACA_REG__MISC0__ERRCNT(bank->regs[ACA_REG_IDX_MISC0]) : 1ULL; + +	return aca_error_cache_log_bank_error(handle, &info, err_type, count);  }  static const struct aca_bank_ops umc_v12_0_aca_bank_ops = { -	.aca_bank_generate_report = umc_v12_0_aca_bank_generate_report, +	.aca_bank_parser = umc_v12_0_aca_bank_parser,  };  const struct aca_info umc_v12_0_aca_info = {  	.hwip = ACA_HWIP_TYPE_UMC, -	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK, +	.mask = ACA_ERROR_UE_MASK | ACA_ERROR_CE_MASK | ACA_ERROR_DEFERRED_MASK,  	.bank_ops = &umc_v12_0_aca_bank_ops,  }; @@ -554,6 +548,152 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common  	return 0;  } +static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, +			uint64_t status, uint64_t ipid, uint64_t addr) +{ +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); +	uint16_t hwid, mcatype; +	struct ta_ras_query_address_input addr_in; +	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; +	uint64_t err_addr, hash_val = 0; +	struct ras_ecc_err *ecc_err; +	int count; +	int ret; + +	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); +	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); + +	if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) +		return 0; + +	if (!status) +		return 0; + +	if (!umc_v12_0_is_deferred_error(adev, status)) +		return 0; + +	err_addr = REG_GET_FIELD(addr, +				MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + +	dev_info(adev->dev, +		"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n", +		ipid, +		MCA_IPID_2_SOCKET_ID(ipid), +		MCA_IPID_2_DIE_ID(ipid), +		MCA_IPID_2_UMC_INST(ipid), +		MCA_IPID_2_UMC_CH(ipid), +		err_addr); + +	memset(page_pfn, 0, sizeof(page_pfn)); + +	memset(&addr_in, 0, sizeof(addr_in)); +	addr_in.ma.err_addr = err_addr; +	addr_in.ma.ch_inst = 
MCA_IPID_2_UMC_CH(ipid);
+	addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
+	addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
+	addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
+
+	count = umc_v12_0_convert_err_addr(adev,
+				&addr_in, page_pfn, ARRAY_SIZE(page_pfn));
+	if (count <= 0) {
+		dev_warn(adev->dev, "Failed to convert error address! count:%d\n", count);
+		return 0;
+	}
+
+	ret = amdgpu_umc_build_pages_hash(adev,
+			page_pfn, count, &hash_val);
+	if (ret) {
+		dev_err(adev->dev, "Failed to build error pages hash\n");
+		return ret;
+	}
+
+	ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
+	if (!ecc_err)
+		return -ENOMEM;
+
+	ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
+	if (!ecc_err->err_pages.pfn) {
+		kfree(ecc_err);
+		return -ENOMEM;
+	}
+
+	memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
+	ecc_err->err_pages.count = count;
+
+	ecc_err->hash_index = hash_val;
+	ecc_err->status = status;
+	ecc_err->ipid = ipid;
+	ecc_err->addr = addr;
+
+	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
+	if (ret) {
+		if (ret == -EEXIST)
+			con->umc_ecc_log.de_updated = true;
+		else
+			dev_err(adev->dev, "Failed to log ecc error! ret:%d\n", ret);
+
+		kfree(ecc_err->err_pages.pfn);
+		kfree(ecc_err);
+		return ret;
+	}
+
+	con->umc_ecc_log.de_updated = true;
+
+	return 0;
+}
+
+static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
+				struct ras_ecc_err *ecc_err, void *ras_error_status)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	uint32_t i = 0;
+	int ret = 0;
+
+	if (!err_data || !ecc_err)
+		return -EINVAL;
+
+	for (i = 0; i < ecc_err->err_pages.count; i++) {
+		ret = amdgpu_umc_fill_error_record(err_data,
+				ecc_err->addr,
+				ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
+				MCA_IPID_2_UMC_CH(ecc_err->ipid),
+				MCA_IPID_2_UMC_INST(ecc_err->ipid));
+		if (ret)
+			break;
+	}
+
+	err_data->de_count++;
+
+	return ret;
+}
+
+static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
+					void *ras_error_status)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
+	struct radix_tree_root *ecc_tree;
+	int new_detected, ret, i;
+
+	ecc_tree = &con->umc_ecc_log.de_page_tree;
+
+	mutex_lock(&con->umc_ecc_log.lock);
+	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
+			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
+	for (i = 0; i < new_detected; i++) {
+		if (!entries[i])
+			continue;
+
+		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
+		if (ret) {
+			dev_err(adev->dev, "Failed to fill umc error record, ret:%d\n", ret);
+			break;
+		}
+		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
+	}
+	mutex_unlock(&con->umc_ecc_log.lock);
+}
+
 struct amdgpu_umc_ras umc_v12_0_ras = {
 	.ras_block = {
 		.hw_ops = &umc_v12_0_ras_hw_ops,
@@ -561,8 +701,8 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
 	},
 	.err_cnt_init = umc_v12_0_err_cnt_init,
 	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
-	.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
-	.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
+	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
 	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
+	.update_ecc_status = umc_v12_0_update_ecc_status,
 };
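umc_v12_0_update_ecc_status() tags each newly logged deferred error in a radix tree, and umc_v12_0_query_ras_ecc_err_addr() later drains exactly the tagged entries with a gang lookup, clearing the tag once an entry has been turned into an error record. A condensed, self-contained sketch of that produce/consume pattern (MY_NEW_TAG and struct my_err are placeholders for UMC_ECC_NEW_DETECTED_TAG and struct ras_ecc_err):

#include <linux/kernel.h>
#include <linux/radix-tree.h>

#define MY_NEW_TAG	0	/* radix-tree tags are small ints (0..2) */

struct my_err {			/* placeholder for struct ras_ecc_err */
	unsigned long key;	/* hash index used as the tree key */
};

static RADIX_TREE(err_tree, GFP_KERNEL);

/* Producer: log an entry once and mark it as not yet consumed. */
static int log_err(struct my_err *e)
{
	int ret = radix_tree_insert(&err_tree, e->key, e);

	if (!ret)
		radix_tree_tag_set(&err_tree, e->key, MY_NEW_TAG);
	return ret;		/* -EEXIST if this key was already logged */
}

/* Consumer: gang-look-up only the tagged entries, then clear the tag
 * so the next pass skips everything already turned into a record. */
static void drain_new_errs(void)
{
	struct my_err *batch[32];
	unsigned int n, i;

	n = radix_tree_gang_lookup_tag(&err_tree, (void **)batch,
				       0, ARRAY_SIZE(batch), MY_NEW_TAG);
	for (i = 0; i < n; i++) {
		/* ...convert batch[i] into an error record here... */
		radix_tree_tag_clear(&err_tree, batch[i]->key, MY_NEW_TAG);
	}
}

diff --git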
a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 5973bfb14fce..b4974793850b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -55,83 +55,38 @@
 #define UMC_V12_0_NA_MAP_PA_NUM        8
 /* R13 bit shift should be considered, double the number */
 #define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
-/* bank bits in MCA error address */
-#define UMC_V12_0_MCA_B0_BIT 6
-#define UMC_V12_0_MCA_B1_BIT 7
-#define UMC_V12_0_MCA_B2_BIT 8
-#define UMC_V12_0_MCA_B3_BIT 9
+
 /* column bits in SOC physical address */
 #define UMC_V12_0_PA_C2_BIT 15
 #define UMC_V12_0_PA_C4_BIT 21
 /* row bits in SOC physical address */
 #define UMC_V12_0_PA_R13_BIT 35
-/* channel index bits in SOC physical address */
-#define UMC_V12_0_PA_CH4_BIT 12
-#define UMC_V12_0_PA_CH5_BIT 13
-#define UMC_V12_0_PA_CH6_BIT 14
-
-/* bank hash settings */
-#define UMC_V12_0_XOR_EN0 1
-#define UMC_V12_0_XOR_EN1 1
-#define UMC_V12_0_XOR_EN2 1
-#define UMC_V12_0_XOR_EN3 1
-#define UMC_V12_0_COL_XOR0 0x0
-#define UMC_V12_0_COL_XOR1 0x0
-#define UMC_V12_0_COL_XOR2 0x800
-#define UMC_V12_0_COL_XOR3 0x1000
-#define UMC_V12_0_ROW_XOR0 0x11111
-#define UMC_V12_0_ROW_XOR1 0x22222
-#define UMC_V12_0_ROW_XOR2 0x4444
-#define UMC_V12_0_ROW_XOR3 0x8888
-
-/* channel hash settings */
-#define UMC_V12_0_HASH_4K 0
-#define UMC_V12_0_HASH_64K 1
-#define UMC_V12_0_HASH_2M 1
-#define UMC_V12_0_HASH_1G 1
-#define UMC_V12_0_HASH_1T 1
-
-/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA),
- * hash bit is only effective when related setting is enabled
- */
-#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \
-				(((pa)  >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
-				(((pa)  >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
-				(((pa)  >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
-				(((pa)  >> 41) & 0x1ULL & UMC_V12_0_HASH_1T))
-#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \
-				(((pa)  >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
-				(((pa)  >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
-				(((pa)  >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
-				(((pa)  >> 42) & 0x1ULL & UMC_V12_0_HASH_1T))
-#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \
-				(((pa)  >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
-				(((pa)  >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
-				(((pa)  >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
-				(((pa)  >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \
-				(((pa)  >> 47) & 0x1ULL & UMC_V12_0_HASH_4K))
-#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \
-		(pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \
-		(pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \
-		(pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \
-		(pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
-	} while (0)
+
+#define MCA_UMC_HWID_V12_0     0x96
+#define MCA_UMC_MCATYPE_V12_0  0x0
 
 #define MCA_IPID_LO_2_UMC_CH(_ipid_lo) (((((_ipid_lo) >> 20) & 0x1) * 4) + \
 			(((_ipid_lo) >> 12) & 0xF))
 #define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
+#define MCA_IPID_2_DIE_ID(ipid)  ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03)
+
+#define MCA_IPID_2_UMC_CH(ipid) \
+	(MCA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define MCA_IPID_2_UMC_INST(ipid) \
+	(MCA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
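The new MCA_IPID_2_* helpers recover UMC topology (socket, die, instance, channel) from the MCA IPID register's InstanceIdHi/InstanceIdLo fields with plain shifts and masks. A standalone illustration of the channel decode, mirroring the MCA_IPID_LO_2_UMC_CH() formula shown above:

#include <stdint.h>
#include <stdio.h>

/* Channel decode as in MCA_IPID_LO_2_UMC_CH(): bit 20 of InstanceIdLo
 * selects the upper group of four channels, bits [15:12] select the
 * channel within the group. */
static unsigned int umc_ch_from_ipid_lo(uint32_t ipid_lo)
{
	return (((ipid_lo >> 20) & 0x1) * 4) + ((ipid_lo >> 12) & 0xF);
}

int main(void)
{
	uint32_t ipid_lo = (1u << 20) | (0x3u << 12);

	/* (1 * 4) + 3 = channel 7 */
	printf("channel = %u\n", umc_ch_from_ipid_lo(ipid_lo));
	return 0;
}

+
+#define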
MCA_IPID_2_SOCKET_ID(ipid) \ +	(((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \ +	 (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03)) +  bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);  bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);  bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);  typedef bool (*check_error_type_func)(struct amdgpu_device *adev, uint64_t mc_umc_status); -extern const uint32_t -	umc_v12_0_channel_idx_tbl[] -			[UMC_V12_0_UMC_INSTANCE_NUM] -			[UMC_V12_0_CHANNEL_INSTANCE_NUM]; -  extern struct amdgpu_umc_ras umc_v12_0_ras;  #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index c4c77257710c..a32f87992f20 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -442,11 +442,6 @@ static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *ade  		umc_v8_10_ecc_info_query_error_address, ras_error_status);  } -static void umc_v8_10_set_eeprom_table_version(struct amdgpu_ras_eeprom_table_header *hdr) -{ -	hdr->version = RAS_TABLE_VER_V2_1; -} -  const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {  	.query_ras_error_count = umc_v8_10_query_ras_error_count,  	.query_ras_error_address = umc_v8_10_query_ras_error_address, @@ -460,5 +455,4 @@ struct amdgpu_umc_ras umc_v8_10_ras = {  	.query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,  	.ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count,  	.ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address, -	.set_eeprom_table_version = umc_v8_10_set_eeprom_table_version,  }; diff --git a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c index 8e7b763cfdb7..bd57896ab85d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umsch_mm_v4_0.c @@ -60,7 +60,7 @@ static int umsch_mm_v4_0_load_microcode(struct amdgpu_umsch_mm *umsch)  	umsch->cmd_buf_curr_ptr = umsch->cmd_buf_ptr; -	if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 5)) { +	if (amdgpu_ip_version(adev, VCN_HWIP, 0) >= IP_VERSION(4, 0, 5)) {  		WREG32_SOC15(VCN, 0, regUVD_IPX_DLDO_CONFIG,  			1 << UVD_IPX_DLDO_CONFIG__ONO0_PWR_CONFIG__SHIFT);  		SOC15_WAIT_ON_RREG(VCN, 0, regUVD_IPX_DLDO_STATUS, @@ -225,6 +225,8 @@ static int umsch_mm_v4_0_ring_start(struct amdgpu_umsch_mm *umsch)  	WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_SIZE, ring->ring_size); +	ring->wptr = 0; +  	data = RREG32_SOC15(VCN, 0, regVCN_RB_ENABLE);  	data &= ~(VCN_RB_ENABLE__AUDIO_RB_EN_MASK);  	WREG32_SOC15(VCN, 0, regVCN_RB_ENABLE, data); @@ -248,7 +250,7 @@ static int umsch_mm_v4_0_ring_stop(struct amdgpu_umsch_mm *umsch)  	data = REG_SET_FIELD(data, VCN_UMSCH_RB_DB_CTRL, EN, 0);  	WREG32_SOC15(VCN, 0, regVCN_UMSCH_RB_DB_CTRL, data); -	if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 5)) { +	if (amdgpu_ip_version(adev, VCN_HWIP, 0) >= IP_VERSION(4, 0, 5)) {  		WREG32_SOC15(VCN, 0, regUVD_IPX_DLDO_CONFIG,  			2 << UVD_IPX_DLDO_CONFIG__ONO0_PWR_CONFIG__SHIFT);  		SOC15_WAIT_ON_RREG(VCN, 0, regUVD_IPX_DLDO_STATUS, @@ -271,6 +273,8 @@ static int umsch_mm_v4_0_set_hw_resources(struct amdgpu_umsch_mm *umsch)  	set_hw_resources.vmid_mask_mm_vcn = umsch->vmid_mask_mm_vcn;  	set_hw_resources.vmid_mask_mm_vpe = umsch->vmid_mask_mm_vpe; +	set_hw_resources.collaboration_mask_vpe = +		adev->vpe.collaborate_mode ? 
0x3 : 0x0;  	set_hw_resources.engine_mask = umsch->engine_mask;  	set_hw_resources.vcn0_hqd_mask[0] = umsch->vcn0_hqd_mask; @@ -346,6 +350,7 @@ static int umsch_mm_v4_0_add_queue(struct amdgpu_umsch_mm *umsch,  	add_queue.h_queue = input_ptr->h_queue;  	add_queue.vm_context_cntl = input_ptr->vm_context_cntl;  	add_queue.is_context_suspended = input_ptr->is_context_suspended; +	add_queue.collaboration_mode = adev->vpe.collaborate_mode ? 1 : 0;  	add_queue.api_status.api_completion_fence_addr = umsch->ring.fence_drv.gpu_addr;  	add_queue.api_status.api_completion_fence_value = ++umsch->ring.fence_drv.sync_seq; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c index a6006f231c65..805d6662c88b 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c @@ -819,6 +819,8 @@ static const struct amd_ip_funcs uvd_v3_1_ip_funcs = {  	.soft_reset = uvd_v3_1_soft_reset,  	.set_clockgating_state = uvd_v3_1_set_clockgating_state,  	.set_powergating_state = uvd_v3_1_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version uvd_v3_1_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c index 1aa09ad7bbe3..3f19c606f4de 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c @@ -769,6 +769,8 @@ static const struct amd_ip_funcs uvd_v4_2_ip_funcs = {  	.soft_reset = uvd_v4_2_soft_reset,  	.set_clockgating_state = uvd_v4_2_set_clockgating_state,  	.set_powergating_state = uvd_v4_2_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs uvd_v4_2_ring_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c index f8b229b75435..efd903c21d48 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c @@ -877,6 +877,8 @@ static const struct amd_ip_funcs uvd_v5_0_ip_funcs = {  	.set_clockgating_state = uvd_v5_0_set_clockgating_state,  	.set_powergating_state = uvd_v5_0_set_powergating_state,  	.get_clockgating_state = uvd_v5_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs uvd_v5_0_ring_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index a9a6880f44e3..495de5068455 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -1545,6 +1545,8 @@ static const struct amd_ip_funcs uvd_v6_0_ip_funcs = {  	.set_clockgating_state = uvd_v6_0_set_clockgating_state,  	.set_powergating_state = uvd_v6_0_set_powergating_state,  	.get_clockgating_state = uvd_v6_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs uvd_v6_0_ring_phys_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c index a08e7abca423..66fada199bda 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v2_0.c @@ -626,6 +626,8 @@ static const struct amd_ip_funcs vce_v2_0_ip_funcs = {  	.soft_reset = vce_v2_0_soft_reset,  	.set_clockgating_state = vce_v2_0_set_clockgating_state,  	.set_powergating_state = vce_v2_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs vce_v2_0_ring_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c index f4760748d349..32517c364cf7 100644 --- a/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vce_v3_0.c @@ -913,6 +913,8 @@ static const struct amd_ip_funcs vce_v3_0_ip_funcs = {  	.set_clockgating_state = vce_v3_0_set_clockgating_state,  	.set_powergating_state = vce_v3_0_set_powergating_state,  	.get_clockgating_state = vce_v3_0_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs vce_v3_0_ring_phys_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c index aaceecd558cf..cb253bd3a2a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c @@ -1902,6 +1902,8 @@ static const struct amd_ip_funcs vcn_v1_0_ip_funcs = {  	.post_soft_reset = NULL /* vcn_v1_0_post_soft_reset */,  	.set_clockgating_state = vcn_v1_0_set_clockgating_state,  	.set_powergating_state = vcn_v1_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  /* diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c index e357d8cf0c01..f18fd61c435e 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c @@ -2008,6 +2008,8 @@ static const struct amd_ip_funcs vcn_v2_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v2_0_set_clockgating_state,  	.set_powergating_state = vcn_v2_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ring_funcs vcn_v2_0_dec_ring_vm_funcs = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c index 1cd8a94b0fbc..baec14bde2a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_5.c @@ -1901,6 +1901,8 @@ static const struct amd_ip_funcs vcn_v2_5_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v2_5_set_clockgating_state,  	.set_powergating_state = vcn_v2_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amd_ip_funcs vcn_v2_6_ip_funcs = { @@ -1921,6 +1923,8 @@ static const struct amd_ip_funcs vcn_v2_6_ip_funcs = {          .post_soft_reset = NULL,          .set_clockgating_state = vcn_v2_5_set_clockgating_state,          .set_powergating_state = vcn_v2_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v2_5_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c index 8f82fb887e9c..6b31cf4b8aac 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c @@ -359,6 +359,7 @@ static int vcn_v3_0_hw_init(void *handle)  		}  	} +	return 0;  done:  	if (!r)  		DRM_INFO("VCN decode and encode initialized successfully(under %s).\n", @@ -2230,6 +2231,8 @@ static const struct amd_ip_funcs vcn_v3_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v3_0_set_clockgating_state,  	.set_powergating_state = vcn_v3_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v3_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c index 832d15f7b5f6..ac1b8ead03b3 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c @@ -288,6 +288,7 @@ static int vcn_v4_0_hw_init(void 
*handle)  		}  	} +	return 0;  done:  	if (!r)  		DRM_INFO("VCN decode and encode initialized successfully(under %s).\n", @@ -2130,6 +2131,8 @@ static const struct amd_ip_funcs vcn_v4_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v4_0_set_clockgating_state,  	.set_powergating_state = vcn_v4_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v4_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index 203fa988322b..2279d8fce03d 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -1660,6 +1660,8 @@ static const struct amd_ip_funcs vcn_v4_0_3_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v4_0_3_set_clockgating_state,  	.set_powergating_state = vcn_v4_0_3_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v4_0_3_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c index 501e53e69f2a..81fb99729f37 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_5.c @@ -237,6 +237,7 @@ static int vcn_v4_0_5_hw_init(void *handle)  			goto done;  	} +	return 0;  done:  	if (!r)  		DRM_INFO("VCN decode and encode initialized successfully(under %s).\n", @@ -1752,6 +1753,8 @@ static const struct amd_ip_funcs vcn_v4_0_5_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v4_0_5_set_clockgating_state,  	.set_powergating_state = vcn_v4_0_5_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v4_0_5_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c index bc60c554eb32..851975b5ce29 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_0.c @@ -95,7 +95,7 @@ static int vcn_v5_0_0_sw_init(void *handle)  		return r;  	for (i = 0; i < adev->vcn.num_vcn_inst; i++) { -		volatile struct amdgpu_vcn4_fw_shared *fw_shared; +		volatile struct amdgpu_vcn5_fw_shared *fw_shared;  		if (adev->vcn.harvest_config & (1 << i))  			continue; @@ -154,7 +154,7 @@ static int vcn_v5_0_0_sw_fini(void *handle)  	if (drm_dev_enter(adev_to_drm(adev), &idx)) {  		for (i = 0; i < adev->vcn.num_vcn_inst; i++) { -			volatile struct amdgpu_vcn4_fw_shared *fw_shared; +			volatile struct amdgpu_vcn5_fw_shared *fw_shared;  			if (adev->vcn.harvest_config & (1 << i))  				continue; @@ -203,6 +203,7 @@ static int vcn_v5_0_0_hw_init(void *handle)  			goto done;  	} +	return 0;  done:  	if (!r)  		DRM_INFO("VCN decode and encode initialized successfully(under %s).\n", @@ -334,7 +335,7 @@ static void vcn_v5_0_0_mc_resume(struct amdgpu_device *adev, int inst)  		upper_32_bits(adev->vcn.inst[inst].fw_shared.gpu_addr));  	WREG32_SOC15(VCN, inst, regUVD_VCPU_NONCACHE_OFFSET0, 0);  	WREG32_SOC15(VCN, inst, regUVD_VCPU_NONCACHE_SIZE0, -		AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn4_fw_shared))); +		AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn5_fw_shared)));  }  /** @@ -438,7 +439,7 @@ static void vcn_v5_0_0_mc_resume_dpg_mode(struct amdgpu_device *adev, int inst_i  		VCN, inst_idx, regUVD_VCPU_NONCACHE_OFFSET0), 0, 0, indirect);  	WREG32_SOC24_DPG_MODE(inst_idx, SOC24_DPG_MODE_OFFSET(  		VCN, inst_idx, regUVD_VCPU_NONCACHE_SIZE0), -		AMDGPU_GPU_PAGE_ALIGN(sizeof(struct 
amdgpu_vcn4_fw_shared)), 0, indirect); +		AMDGPU_GPU_PAGE_ALIGN(sizeof(struct amdgpu_vcn5_fw_shared)), 0, indirect);  	/* VCN global tiling registers */  	WREG32_SOC24_DPG_MODE(inst_idx, SOC24_DPG_MODE_OFFSET( @@ -615,7 +616,7 @@ static void vcn_v5_0_0_enable_clock_gating(struct amdgpu_device *adev, int inst)   */  static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_device *adev, int inst_idx, bool indirect)  { -	volatile struct amdgpu_vcn4_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; +	volatile struct amdgpu_vcn5_fw_shared *fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;  	struct amdgpu_ring *ring;  	uint32_t tmp; @@ -712,7 +713,7 @@ static int vcn_v5_0_0_start_dpg_mode(struct amdgpu_device *adev, int inst_idx, b   */  static int vcn_v5_0_0_start(struct amdgpu_device *adev)  { -	volatile struct amdgpu_vcn4_fw_shared *fw_shared; +	volatile struct amdgpu_vcn5_fw_shared *fw_shared;  	struct amdgpu_ring *ring;  	uint32_t tmp;  	int i, j, k, r; @@ -893,7 +894,7 @@ static void vcn_v5_0_0_stop_dpg_mode(struct amdgpu_device *adev, int inst_idx)   */  static int vcn_v5_0_0_stop(struct amdgpu_device *adev)  { -	volatile struct amdgpu_vcn4_fw_shared *fw_shared; +	volatile struct amdgpu_vcn5_fw_shared *fw_shared;  	uint32_t tmp;  	int i, r = 0; @@ -1328,6 +1329,8 @@ static const struct amd_ip_funcs vcn_v5_0_0_ip_funcs = {  	.post_soft_reset = NULL,  	.set_clockgating_state = vcn_v5_0_0_set_clockgating_state,  	.set_powergating_state = vcn_v5_0_0_set_powergating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  const struct amdgpu_ip_block_version vcn_v5_0_0_ip_block = { diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index 1a98812981f4..d39c670f6220 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -897,7 +897,7 @@ static int vi_asic_pci_config_reset(struct amdgpu_device *adev)  	return r;  } -static bool vi_asic_supports_baco(struct amdgpu_device *adev) +static int vi_asic_supports_baco(struct amdgpu_device *adev)  {  	switch (adev->asic_type) {  	case CHIP_FIJI: @@ -908,14 +908,14 @@ static bool vi_asic_supports_baco(struct amdgpu_device *adev)  	case CHIP_TOPAZ:  		return amdgpu_dpm_is_baco_supported(adev);  	default: -		return false; +		return 0;  	}  }  static enum amd_reset_method  vi_asic_reset_method(struct amdgpu_device *adev)  { -	bool baco_reset; +	int baco_reset;  	if (amdgpu_reset_method == AMD_RESET_METHOD_LEGACY ||  	    amdgpu_reset_method == AMD_RESET_METHOD_BACO) @@ -935,7 +935,7 @@ vi_asic_reset_method(struct amdgpu_device *adev)  		baco_reset = amdgpu_dpm_is_baco_supported(adev);  		break;  	default: -		baco_reset = false; +		baco_reset = 0;  		break;  	} @@ -2058,6 +2058,8 @@ static const struct amd_ip_funcs vi_common_ip_funcs = {  	.set_clockgating_state = vi_common_set_clockgating_state,  	.set_powergating_state = vi_common_set_powergating_state,  	.get_clockgating_state = vi_common_get_clockgating_state, +	.dump_ip_state = NULL, +	.print_ip_state = NULL,  };  static const struct amdgpu_ip_block_version vi_common_ip_block = diff --git a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c index 769eb8f7bb3c..09315dd5a1ec 100644 --- a/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c @@ -144,6 +144,12 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe)  			WREG32(vpe_get_reg_offset(vpe, j, regVPEC_CNTL), ret);  	} +	/* setup collaborate mode */ +	vpe_v6_1_set_collaborate_mode(vpe, true); +	/* setup DPM 
*/ +	if (amdgpu_vpe_configure_dpm(vpe)) +		dev_warn(adev->dev, "VPE failed to enable DPM\n"); +  	/*  	 * For VPE 6.1.1, still only need to add master's offset, and psp will apply it to slave as well.  	 * Here use instance 0 as master. @@ -159,11 +165,7 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe)  		adev->vpe.cmdbuf_cpu_addr[0] = f32_offset;  		adev->vpe.cmdbuf_cpu_addr[1] = f32_cntl; -		amdgpu_vpe_psp_update_sram(adev); -		vpe_v6_1_set_collaborate_mode(vpe, true); -		amdgpu_vpe_configure_dpm(vpe); - -		return 0; +		return amdgpu_vpe_psp_update_sram(adev);  	}  	vpe_hdr = (const struct vpe_firmware_header_v1_0 *)adev->vpe.fw->data; @@ -196,8 +198,6 @@ static int vpe_v6_1_load_microcode(struct amdgpu_vpe *vpe)  	}  	vpe_v6_1_halt(vpe, false); -	vpe_v6_1_set_collaborate_mode(vpe, true); -	amdgpu_vpe_configure_dpm(vpe);  	return 0;  } |
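A recurring change throughout this section is widening the supports_baco() callbacks (si.c, soc15.c, vi.c) from bool to int. One plausible reading is that the return value can now carry capability flags rather than a plain yes/no, with 0 keeping its old meaning of "not supported" so boolean call sites keep working. A self-contained sketch of that convention (the flag names are illustrative, not the driver's):

#include <stdbool.h>
#include <stdio.h>

/* 0 still means "not supported", so old `if (supports_baco(...))`
 * checks stay valid; nonzero can now encode individual capability
 * bits. EX_BACO_SUPPORT/EX_MACO_SUPPORT are hypothetical names. */
enum {
	EX_BACO_SUPPORT = 1 << 0,
	EX_MACO_SUPPORT = 1 << 1,
};

static int ex_supports_baco(bool has_baco, bool has_maco)
{
	int caps = 0;

	if (has_baco)
		caps |= EX_BACO_SUPPORT;
	if (has_maco)
		caps |= EX_MACO_SUPPORT;
	return caps;
}

int main(void)
{
	int caps = ex_supports_baco(true, false);

	if (caps)	/* boolean-style check is still valid */
		printf("BACO-capable, caps=0x%x\n", caps);
	return 0;
}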