Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 332
1 file changed, 247 insertions, 85 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7753a2e64d41..bcacf2e35eba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -74,6 +74,7 @@
 #include "amdgpu_fru_eeprom.h"
 #include "amdgpu_reset.h"
 #include "amdgpu_virt.h"
+#include "amdgpu_dev_coredump.h"
 
 #include <linux/suspend.h>
 #include <drm/task_barrier.h>
@@ -143,6 +144,8 @@ const char *amdgpu_asic_name[] = {
 	"LAST",
 };
 
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+
 /**
  * DOC: pcie_replay_count
  *
@@ -335,16 +338,93 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
  *
  * @dev: drm_device pointer
  *
- * Returns true if the device supporte BACO,
- * otherwise return false.
+ * Return:
+ * 1 if the device supporte BACO;
+ * 3 if the device support MACO (only works if BACO is supported)
+ * otherwise return 0.
  */
-bool amdgpu_device_supports_baco(struct drm_device *dev)
+int amdgpu_device_supports_baco(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = drm_to_adev(dev);
 
 	return amdgpu_asic_supports_baco(adev);
 }
 
+void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
+{
+	struct drm_device *dev;
+	int bamaco_support;
+
+	dev = adev_to_drm(adev);
+
+	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
+	bamaco_support = amdgpu_device_supports_baco(dev);
+
+	switch (amdgpu_runtime_pm) {
+	case 2:
+		if (bamaco_support & MACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
+		} else if (bamaco_support == BACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+			dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
+		}
+		break;
+	case 1:
+		if (bamaco_support & BACO_SUPPORT) {
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
+		}
+		break;
+	case -1:
+	case -2:
+		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
+			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
+			dev_info(adev->dev, "Using ATPX for runtime pm\n");
+		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
+			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
+			dev_info(adev->dev, "Using BOCO for runtime pm\n");
+		} else {
+			if (!bamaco_support)
+				goto no_runtime_pm;
+
+			switch (adev->asic_type) {
+			case CHIP_VEGA20:
+			case CHIP_ARCTURUS:
+				/* BACO are not supported on vega20 and arctrus */
+				break;
+			case CHIP_VEGA10:
+				/* enable BACO as runpm mode if noretry=0 */
+				if (!adev->gmc.noretry)
+					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+				break;
+			default:
+				/* enable BACO as runpm mode on CI+ */
+				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+				break;
+			}
+
+			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
+				if (bamaco_support & MACO_SUPPORT) {
+					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
+				} else {
+					dev_info(adev->dev, "Using BACO for runtime pm\n");
+				}
+			}
+		}
+		break;
+	case 0:
+		dev_info(adev->dev, "runtime pm is manually disabled\n");
+		break;
+	default:
+		break;
+	}
+
+no_runtime_pm:
+	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
+		dev_info(adev->dev, "Runtime PM not available\n");
+}
 /**
  * amdgpu_device_supports_smart_shift - Is the device dGPU with
  * smart shift support
@@ -599,7 +679,7 @@ uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, false,
							 &rlcg_flag)) {
-			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
+			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 		    amdgpu_sriov_runtime(adev) &&
 		    down_read_trylock(&adev->reset_domain->sem)) {
@@ -730,7 +810,7 @@ void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
							 GC_HWIP, true,
							 &rlcg_flag)) {
-			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
+			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 		    amdgpu_sriov_runtime(adev) &&
 		    down_read_trylock(&adev->reset_domain->sem)) {
@@ -1228,6 +1308,7 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 	amdgpu_asic_pre_asic_init(adev);
 
 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
 		amdgpu_psp_wait_for_bootloader(adev);
 		ret = amdgpu_atomfirmware_asic_init(adev, true);
@@ -1402,13 +1483,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
  */
 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 {
-	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
+	unsigned long flags, offset;
 
+	spin_lock_irqsave(&adev->wb.lock, flags);
+	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 	if (offset < adev->wb.num_wb) {
 		__set_bit(offset, adev->wb.used);
+		spin_unlock_irqrestore(&adev->wb.lock, flags);
 		*wb = offset << 3; /* convert to dw offset */
 		return 0;
 	} else {
+		spin_unlock_irqrestore(&adev->wb.lock, flags);
 		return -EINVAL;
 	}
 }
@@ -1423,9 +1508,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
  */
 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 {
+	unsigned long flags;
+
 	wb >>= 3;
+	spin_lock_irqsave(&adev->wb.lock, flags);
 	if (wb < adev->wb.num_wb)
 		__clear_bit(wb, adev->wb.used);
+	spin_unlock_irqrestore(&adev->wb.lock, flags);
 }
 
 /**
@@ -1455,7 +1544,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 
 	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
 	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
-		DRM_WARN("System can't access extended configuration space,please check!!\n");
+		DRM_WARN("System can't access extended configuration space, please check!!\n");
 
 	/* skip if the bios has already enabled large BAR */
 	if (adev->gmc.real_vram_size &&
@@ -2261,7 +2350,6 @@ void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 {
 	const char *chip_name;
-	char fw_name[40];
 	int err;
 	const struct gpu_info_firmware_header_v1_0 *hdr;
 
@@ -2295,12 +2383,12 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 		break;
 	}
 
-	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
-	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
+	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
+				   "amdgpu/%s_gpu_info.bin", chip_name);
 	if (err) {
 		dev_err(adev->dev,
-			"Failed to get gpu_info firmware \"%s\"\n",
-			fw_name);
+			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
+			chip_name);
 		goto out;
 	}
 
@@ -3054,7 +3142,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 		return r;
 	}
 
-	amdgpu_ras_set_error_query_ready(adev, true);
+	if (!amdgpu_in_reset(adev))
+		amdgpu_ras_set_error_query_ready(adev, true);
 
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
@@ -3960,6 +4049,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->grbm_idx_mutex);
 	mutex_init(&adev->mn_lock);
 	mutex_init(&adev->virt.vf_errors.lock);
+	mutex_init(&adev->virt.rlcg_reg_lock);
 	hash_init(adev->mn_hash);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);
@@ -3981,6 +4071,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	spin_lock_init(&adev->se_cac_idx_lock);
 	spin_lock_init(&adev->audio_endpt_idx_lock);
 	spin_lock_init(&adev->mm_stats.lock);
+	spin_lock_init(&adev->wb.lock);
 
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
@@ -4069,6 +4160,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	/* Enable TMZ based on IP_VERSION */
 	amdgpu_gmc_tmz_set(adev);
 
+	if (amdgpu_sriov_vf(adev) &&
+	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
+		/* VF MMIO access (except mailbox range) from CPU
+		 * will be blocked during sriov runtime
+		 */
+		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
+
 	amdgpu_gmc_noretry_set(adev);
 	/* Need to get xgmi info early to decide the reset behavior*/
 	if (adev->gmc.xgmi.supported) {
@@ -4915,7 +5013,8 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
 		shadow = vmbo->shadow;
 
 		/* No need to recover an evicted BO */
-		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
+		if (!shadow->tbo.resource ||
+		    shadow->tbo.resource->mem_type != TTM_PL_TT ||
 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
 			continue;
@@ -4959,27 +5058,30 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu_device pointer
- * @from_hypervisor: request from hypervisor
+ * @reset_context: amdgpu reset context pointer
  *
  * do VF FLR and reinitialize Asic
 * return 0 means succeeded otherwise failed
  */
 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-				     bool from_hypervisor)
+				     struct amdgpu_reset_context *reset_context)
 {
 	int r;
 	struct amdgpu_hive_info *hive = NULL;
-	int retry_limit = 0;
 
-retry:
-	amdgpu_amdkfd_pre_reset(adev);
-
-	if (from_hypervisor)
+	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
+		if (!amdgpu_ras_get_fed_status(adev))
+			amdgpu_virt_ready_to_reset(adev);
+		amdgpu_virt_wait_reset(adev);
+		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
 		r = amdgpu_virt_request_full_gpu(adev, true);
-	else
+	} else {
 		r = amdgpu_virt_reset_gpu(adev);
+	}
 	if (r)
 		return r;
+
+	amdgpu_ras_set_fed(adev, false);
 	amdgpu_irq_gpu_reset_resume_helper(adev);
 
 	/* some sw clean up VF needs to do before recover */
@@ -4988,7 +5090,7 @@ retry:
 	/* Resume IP prior to SMC */
 	r = amdgpu_device_ip_reinit_early_sriov(adev);
 	if (r)
-		goto error;
+		return r;
 
 	amdgpu_virt_init_data_exchange(adev);
 
@@ -4999,38 +5101,41 @@ retry:
 	/* now we are okay to resume SMC/CP/SDMA */
 	r = amdgpu_device_ip_reinit_late_sriov(adev);
 	if (r)
-		goto error;
+		return r;
 
 	hive = amdgpu_get_xgmi_hive(adev);
 	/* Update PSP FW topology after reset */
 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
 		r = amdgpu_xgmi_update_topology(hive, adev);
-
 	if (hive)
 		amdgpu_put_xgmi_hive(hive);
+	if (r)
+		return r;
 
-	if (!r) {
-		r = amdgpu_ib_ring_tests(adev);
-
-		amdgpu_amdkfd_post_reset(adev);
-	}
+	r = amdgpu_ib_ring_tests(adev);
+	if (r)
+		return r;
 
-error:
-	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
 		amdgpu_inc_vram_lost(adev);
 		r = amdgpu_device_recover_vram(adev);
 	}
-	amdgpu_virt_release_full_gpu(adev, true);
+	if (r)
+		return r;
 
-	if (AMDGPU_RETRY_SRIOV_RESET(r)) {
-		if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
-			retry_limit++;
-			goto retry;
-		} else
-			DRM_ERROR("GPU reset retry is beyond the retry limit\n");
-	}
+	/* need to be called during full access so we can't do it later like
+	 * bare-metal does.
	 */
+	amdgpu_amdkfd_post_reset(adev);
+	amdgpu_virt_release_full_gpu(adev, true);
 
-	return r;
+	/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
+	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
+		amdgpu_ras_resume(adev);
+	return 0;
 }
 
 /**
@@ -5121,11 +5226,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
 
 	dev_info(adev->dev, "GPU mode1 reset\n");
 
+	/* Cache the state before bus master disable. The saved config space
+	 * values are used in other cases like restore after mode-2 reset.
+	 */
+	amdgpu_device_cache_pci_state(adev->pdev);
+
 	/* disable BM */
 	pci_clear_master(adev->pdev);
 
-	amdgpu_device_cache_pci_state(adev->pdev);
-
 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
 		dev_info(adev->dev, "GPU smu mode1 reset\n");
 		ret = amdgpu_dpm_mode1_reset(adev);
@@ -5263,11 +5371,23 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
+	uint32_t i;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
-	amdgpu_reset_reg_dumps(tmp_adev);
+
+	if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
+		amdgpu_reset_reg_dumps(tmp_adev);
+
+		dev_info(tmp_adev->dev, "Dumping IP State\n");
+		/* Trigger ip dump before we reset the asic */
+		for (i = 0; i < tmp_adev->num_ip_blocks; i++)
+			if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
+				tmp_adev->ip_blocks[i].version->funcs
+				->dump_ip_state((void *)tmp_adev);
+		dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
+	}
 
 	reset_context->reset_device_list = device_list_handle;
 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
@@ -5340,7 +5460,8 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
 
-				amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
+					amdgpu_coredump(tmp_adev, vram_lost, reset_context);
 
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
@@ -5538,6 +5659,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 
 }
 
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+	struct amdgpu_device *tmp_adev;
+	int ret = 0;
+	u32 status;
+
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+		if (PCI_POSSIBLE_ERROR(status)) {
+			dev_err(tmp_adev->dev, "device lost from bus!");
+			ret = -ENODEV;
+		}
+	}
+
+	return ret;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
@@ -5561,6 +5699,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
+	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
 
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5595,7 +5734,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * to put adev in the 1st position.
 	 */
 	INIT_LIST_HEAD(&device_list);
-	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
+	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			list_add_tail(&tmp_adev->reset_list, &device_list);
 			if (adev->shutdown)
@@ -5609,6 +5748,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		device_list_handle = &device_list;
 	}
 
+	if (!amdgpu_sriov_vf(adev)) {
+		r = amdgpu_device_health_check(device_list_handle);
+		if (r)
+			goto end_reset;
+	}
+
 	/* We need to lock reset domain only once both for XGMI and single device */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
@@ -5636,8 +5781,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
 
-		if (!amdgpu_sriov_vf(tmp_adev))
-			amdgpu_amdkfd_pre_reset(tmp_adev);
+		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
 
 		/*
 		 * Mark these ASICs to be reseted as untracked first
@@ -5690,33 +5834,40 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 				  r, adev_to_drm(tmp_adev)->unique);
 			tmp_adev->asic_reset_res = r;
 		}
-
-		/*
-		 * Drop all pending non scheduler resets. Scheduler resets
-		 * were already dropped during drm_sched_stop
-		 */
-		amdgpu_device_stop_pending_resets(tmp_adev);
 	}
 
 	/* Actual ASIC resets if needed.*/
 	/* Host driver will handle XGMI hive reset for SRIOV */
 	if (amdgpu_sriov_vf(adev)) {
-		r = amdgpu_device_reset_sriov(adev, job ? false : true);
+		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
+			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
+			amdgpu_ras_set_fed(adev, true);
+			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
+		}
+
+		r = amdgpu_device_reset_sriov(adev, reset_context);
+		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
+			amdgpu_virt_release_full_gpu(adev, true);
+			goto retry;
+		}
 		if (r)
 			adev->asic_reset_res = r;
-
-		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
-		if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
-			    IP_VERSION(9, 4, 2) ||
-		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
-		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
-			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
 		if (r && r == -EAGAIN)
 			goto retry;
 	}
 
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		/*
+		 * Drop any pending non scheduler resets queued before reset is done.
+		 * Any reset scheduled after this point would be valid. Scheduler resets
+		 * were already dropped during drm_sched_stop and no new ones can come
+		 * in before drm_sched_start.
+		 */
+		amdgpu_device_stop_pending_resets(tmp_adev);
+	}
+
 skip_hw_reset:
 
 	/* Post ASIC reset for all devs .*/
@@ -5774,6 +5925,7 @@ skip_sched_resume:
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
 
+end_reset:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
 		amdgpu_put_xgmi_hive(hive);
@@ -5809,13 +5961,18 @@ static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
 	*speed = PCI_SPEED_UNKNOWN;
 	*width = PCIE_LNK_WIDTH_UNKNOWN;
 
-	while ((parent = pci_upstream_bridge(parent))) {
-		/* skip upstream/downstream switches internal to dGPU*/
-		if (parent->vendor == PCI_VENDOR_ID_ATI)
-			continue;
-		*speed = pcie_get_speed_cap(parent);
-		*width = pcie_get_width_cap(parent);
-		break;
+	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
+		while ((parent = pci_upstream_bridge(parent))) {
+			/* skip upstream/downstream switches internal to dGPU*/
+			if (parent->vendor == PCI_VENDOR_ID_ATI)
+				continue;
+			*speed = pcie_get_speed_cap(parent);
+			*width = pcie_get_width_cap(parent);
+			break;
+		}
+	} else {
+		/* use the current speeds rather than max if switching is not supported */
+		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
 	}
 }
 
@@ -6030,7 +6187,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 	    adev->nbio.funcs->enable_doorbell_interrupt)
 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
 
-	if (amdgpu_passthrough(adev) &&
+	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
 	    adev->nbio.funcs->clear_doorbell_interrupt)
 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
 
@@ -6130,19 +6287,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	struct amdgpu_reset_context reset_context;
 	u32 memsize;
 	struct list_head device_list;
-	struct amdgpu_hive_info *hive;
-	int hive_ras_recovery = 0;
-	struct amdgpu_ras *ras;
 
 	/* PCI error slot reset should be skipped During RAS recovery */
-	hive = amdgpu_get_xgmi_hive(adev);
-	if (hive) {
-		hive_ras_recovery = atomic_read(&hive->ras_recovery);
-		amdgpu_put_xgmi_hive(hive);
-	}
-	ras = amdgpu_ras_get_context(adev);
-	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
-		 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
+	    amdgpu_ras_in_recovery(adev))
 		return PCI_ERS_RESULT_RECOVERED;
 
 	DRM_INFO("PCI error: slot reset callback!!\n");
@@ -6385,6 +6534,22 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
 }
 
 /**
+ * amdgpu_device_get_gang - return a reference to the current gang
+ * @adev: amdgpu_device pointer
+ *
+ * Returns: A new reference to the current gang leader.
+ */
+struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
+{
+	struct dma_fence *fence;
+
+	rcu_read_lock();
+	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
+	rcu_read_unlock();
+	return fence;
+}
+
+/**
 * amdgpu_device_switch_gang - switch to a new gang
 * @adev: amdgpu_device pointer
 * @gang: the gang to switch to
@@ -6400,10 +6565,7 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
 
 	do {
 		dma_fence_put(old);
-		rcu_read_lock();
-		old = dma_fence_get_rcu_safe(&adev->gang_submit);
-		rcu_read_unlock();
-
+		old = amdgpu_device_get_gang(adev);
 		if (old == gang)
 			break;
 
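Note: the amdgpu_device_wb_get()/amdgpu_device_wb_free() hunks above put the writeback-slot bitmap under the new adev->wb.lock spinlock so the helpers can safely race from IRQ-disabled contexts. As a reference for that pattern only, below is a minimal, self-contained kernel-C sketch of an irqsave-protected bitmap allocator; the demo_* identifiers and DEMO_NUM_SLOTS are hypothetical and are not part of this patch.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/errno.h>

#define DEMO_NUM_SLOTS 256

static DECLARE_BITMAP(demo_used, DEMO_NUM_SLOTS);
static DEFINE_SPINLOCK(demo_lock);

/* Reserve the first free slot; returns 0 and writes the index, or -EINVAL. */
static int demo_slot_get(u32 *slot)
{
	unsigned long flags, offset;

	spin_lock_irqsave(&demo_lock, flags);
	offset = find_first_zero_bit(demo_used, DEMO_NUM_SLOTS);
	if (offset < DEMO_NUM_SLOTS) {
		__set_bit(offset, demo_used);
		spin_unlock_irqrestore(&demo_lock, flags);
		*slot = offset;
		return 0;
	}
	spin_unlock_irqrestore(&demo_lock, flags);
	return -EINVAL;
}

/* Return a slot to the pool; out-of-range indices are ignored. */
static void demo_slot_put(u32 slot)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_lock, flags);
	if (slot < DEMO_NUM_SLOTS)
		__clear_bit(slot, demo_used);
	spin_unlock_irqrestore(&demo_lock, flags);
}

The same structure appears in the patch: the lock is taken around find_first_zero_bit()/__set_bit() and released on both the success and failure paths before the caller uses the returned offset.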