| author | Thomas Gleixner <[email protected]> | 2020-06-11 15:17:57 +0200 |
|---|---|---|
| committer | Thomas Gleixner <[email protected]> | 2020-06-11 15:17:57 +0200 |
| commit | f77d26a9fc525286bcef3d4f98b52e17482cf49c (patch) | |
| tree | 6b179c9aa84787773cb601a14a64255e2912154b /drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | |
| parent | b6bea24d41519e8c31e4798f1c1a3f67e540c5d0 (diff) | |
| parent | f0178fc01fe46bab6a95415f5647d1a74efcad1b (diff) | |
Merge branch 'x86/entry' into ras/core
to fix up conflicts in arch/x86/kernel/cpu/mce/core.c so that MCE-specific
follow-up patches can be applied without creating a horrible merge conflict
afterwards.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 146 |

1 file changed, 84 insertions, 62 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 95b3327168ac..91837a991319 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -325,9 +325,18 @@ success:
 static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
 					  struct amdgpu_hive_info *hive)
 {
+	char node[10];
+	memset(node, 0, sizeof(node));
+
 	device_remove_file(adev->dev, &dev_attr_xgmi_device_id);
-	sysfs_remove_link(&adev->dev->kobj, adev->ddev->unique);
-	sysfs_remove_link(hive->kobj, adev->ddev->unique);
+	device_remove_file(adev->dev, &dev_attr_xgmi_error);
+
+	if (adev != hive->adev)
+		sysfs_remove_link(&adev->dev->kobj,"xgmi_hive_info");
+
+	sprintf(node, "node%d", hive->number_devices);
+	sysfs_remove_link(hive->kobj, node);
+
 }
@@ -373,7 +382,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
-	tmp->pstate = -1;
+	tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
+	tmp->hi_req_gpu = NULL;
+	/*
+	 * hive pstate on boot is high in vega20 so we have to go to low
+	 * pstate on after boot.
+	 */
+	tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
 	mutex_unlock(&xgmi_mutex);
 	return tmp;
@@ -383,56 +398,59 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
-	struct amdgpu_device *tmp_adev;
-	bool update_hive_pstate = true;
-	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
+	struct amdgpu_device *request_adev = hive->hi_req_gpu ?
+						hive->hi_req_gpu : adev;
+	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
+	bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
 
-	if (!hive)
+	/* fw bug so temporarily disable pstate switching */
+	return 0;
+
+	if (!hive || adev->asic_type != CHIP_VEGA20)
 		return 0;
 
 	mutex_lock(&hive->hive_lock);
 
-	if (hive->pstate == pstate) {
-		adev->pstate = is_high_pstate ? pstate : adev->pstate;
+	if (is_hi_req)
+		hive->hi_req_count++;
+	else
+		hive->hi_req_count--;
+
+	/*
+	 * Vega20 only needs single peer to request pstate high for the hive to
+	 * go high but all peers must request pstate low for the hive to go low
+	 */
+	if (hive->pstate == pstate ||
+			(!is_hi_req && hive->hi_req_count && !init_low))
 		goto out;
-	}
 
-	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
+	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
 
-	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
+	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
 	if (ret) {
-		dev_err(adev->dev,
+		dev_err(request_adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
-			adev->gmc.xgmi.node_id,
-			adev->gmc.xgmi.hive_id, ret);
+			request_adev->gmc.xgmi.node_id,
+			request_adev->gmc.xgmi.hive_id, ret);
 		goto out;
 	}
 
-	/* Update device pstate */
-	adev->pstate = pstate;
-
-	/*
-	 * Update the hive pstate only all devices of the hive
-	 * are in the same pstate
-	 */
-	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-		if (tmp_adev->pstate != adev->pstate) {
-			update_hive_pstate = false;
-			break;
-		}
-	}
-
-	if (update_hive_pstate || is_high_pstate)
+	if (init_low)
+		hive->pstate = hive->hi_req_count ?
+					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
+	else {
 		hive->pstate = pstate;
-
+		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
+							adev : NULL;
+	}
 out:
 	mutex_unlock(&hive->hive_lock);
-
 	return ret;
 }
 
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev)
 {
-	int ret = -EINVAL;
+	int ret;
 
 	/* Each psp need to set the latest topology */
 	ret = psp_xgmi_set_topology_info(&adev->psp,
@@ -507,9 +525,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
-	/* Set default device pstate */
-	adev->pstate = -1;
-
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
@@ -577,14 +592,14 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	if (!hive)
 		return -EINVAL;
 
-	if (!(hive->number_devices--)) {
+	task_barrier_rem_task(&hive->tb);
+	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
+	mutex_unlock(&hive->hive_lock);
+
+	if(!(--hive->number_devices)){
 		amdgpu_xgmi_sysfs_destroy(adev, hive);
 		mutex_destroy(&hive->hive_lock);
 		mutex_destroy(&hive->reset_lock);
-	} else {
-		task_barrier_rem_task(&hive->tb);
-		amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
-		mutex_unlock(&hive->hive_lock);
 	}
 
 	return psp_xgmi_terminate(&adev->psp);
@@ -604,6 +619,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
 	    adev->gmc.xgmi.num_physical_nodes == 0)
 		return 0;
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	if (!adev->gmc.xgmi.ras_if) {
 		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
 		if (!adev->gmc.xgmi.ras_if)
@@ -641,31 +658,34 @@ void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)
 uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
 					   uint64_t addr)
 {
-	uint32_t df_inst_id;
-	uint64_t dram_base_addr = 0;
-	const struct amdgpu_df_funcs *df_funcs = adev->df.funcs;
-
-	if ((!df_funcs)                 ||
-	    (!df_funcs->get_df_inst_id) ||
-	    (!df_funcs->get_dram_base_addr)) {
-		dev_warn(adev->dev,
-			 "XGMI: relative phy_addr algorithm is not supported\n");
-		return addr;
-	}
-
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) {
-		dev_warn(adev->dev,
-			 "failed to disable DF-Cstate, DF register may not be accessible\n");
-		return addr;
-	}
+	struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
+	return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
+}
 
-	df_inst_id = df_funcs->get_df_inst_id(adev);
-	dram_base_addr = df_funcs->get_dram_base_addr(adev, df_inst_id);
+static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
+{
+	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
+	WREG32_PCIE(pcs_status_reg, 0);
+}
 
-	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-		dev_warn(adev->dev, "failed to enable DF-Cstate\n");
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+	uint32_t i;
 
-	return addr + dram_base_addr;
+	switch (adev->asic_type) {
+	case CHIP_ARCTURUS:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_arct[i]);
+		break;
+	case CHIP_VEGA20:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_vg20[i]);
+		break;
+	default:
+		break;
+	}
 }
 
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
@@ -758,6 +778,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 		break;
 	}
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	err_data->ue_count += ue_cnt;
 	err_data->ce_count += ce_cnt;
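The most consequential behavioral change in this diff is the pstate arbitration in amdgpu_xgmi_set_pstate(): instead of tracking a per-device pstate and scanning the whole hive for agreement, the hive now keeps a hi_req_count of peers requesting the high pstate. The in-diff comment states the rule: a single peer requesting high raises the hive, but every peer must withdraw its request before the hive may go low. Below is a minimal userspace sketch of that counting rule only; the enum, struct, and function names are invented stand-ins for AMDGPU_XGMI_PSTATE_*, struct amdgpu_hive_info, and amdgpu_xgmi_set_pstate, and it deliberately omits the boot-time seeding of hi_req_count, the hi_req_gpu tracking, and the hive locking.

```c
#include <stdio.h>
#include <stdbool.h>

/* Invented stand-ins for AMDGPU_XGMI_PSTATE_MIN / _MAX_VEGA20. */
enum xgmi_pstate { PSTATE_MIN = 0, PSTATE_MAX = 1 };

/* Simplified stand-in for struct amdgpu_hive_info. */
struct hive {
	int hi_req_count;        /* peers currently requesting high pstate */
	enum xgmi_pstate pstate; /* current hive-wide pstate */
};

/*
 * The counting rule from the diff's comment: one peer requesting high
 * is enough to raise the hive, but every peer must withdraw its
 * request before the hive drops back to low.
 */
static void hive_request_pstate(struct hive *h, bool want_high)
{
	if (want_high)
		h->hi_req_count++;
	else
		h->hi_req_count--;

	h->pstate = h->hi_req_count ? PSTATE_MAX : PSTATE_MIN;
}

int main(void)
{
	struct hive h = { .hi_req_count = 0, .pstate = PSTATE_MIN };

	hive_request_pstate(&h, true);  /* GPU 0 requests high -> hive high  */
	hive_request_pstate(&h, true);  /* GPU 1 requests high -> stays high */
	hive_request_pstate(&h, false); /* GPU 0 withdraws -> one req left   */
	printf("after one release: %d\n", h.pstate);  /* prints 1 (high) */
	hive_request_pstate(&h, false); /* GPU 1 withdraws -> hive goes low  */
	printf("after all released: %d\n", h.pstate); /* prints 0 (low)  */
	return 0;
}
```

Note that in the merged code this whole path is short-circuited by the early `return 0` marked "fw bug so temporarily disable pstate switching", so the counting logic stays dormant until that workaround is lifted.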