Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_device.c	422
1 file changed, 273 insertions, 149 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f84f9e35a73b..a027a8f7b281 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -64,9 +64,11 @@
 #include "amdgpu_xgmi.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_pmu.h"
+#include "amdgpu_fru_eeprom.h"
 
 #include <linux/suspend.h>
 #include <drm/task_barrier.h>
+#include <linux/pm_runtime.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -138,6 +140,72 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 
 /**
+ * DOC: product_name
+ *
+ * The amdgpu driver provides a sysfs API for reporting the product name
+ * for the device.
+ * The file product_name is used for this and returns the product name
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_name(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+}
+
+static DEVICE_ATTR(product_name, S_IRUGO,
+		amdgpu_device_get_product_name, NULL);
+
+/**
+ * DOC: product_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the part number
+ * for the device.
+ * The file product_number is used for this and returns the part number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+}
+
+static DEVICE_ATTR(product_number, S_IRUGO,
+		amdgpu_device_get_product_number, NULL);
+
+/**
+ * DOC: serial_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the serial number
+ * for the device.
+ * The file serial_number is used for this and returns the serial number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_serial_number(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+}
+
+static DEVICE_ATTR(serial_number, S_IRUGO,
+		amdgpu_device_get_serial_number, NULL);
+
+/**
  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
  *
  * @dev: drm_device pointer
@@ -231,10 +299,10 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 }
 
 /*
- * MMIO register access helper functions.
+ * device register access helper functions.
  */
 /**
- * amdgpu_mm_rreg - read a memory mapped IO register
+ * amdgpu_device_rreg - read a register
  *
  * @adev: amdgpu_device pointer
  * @reg: dword aligned register offset
@@ -242,25 +310,19 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 *
 * Returns the 32 bit value from the offset specified.
 */
-uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
-			uint32_t acc_flags)
+uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, uint32_t reg,
+			    uint32_t acc_flags)
 {
 	uint32_t ret;
 
-	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 		return amdgpu_kiq_rreg(adev, reg);
 
-	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+	if ((reg * 4) < adev->rmmio_size)
 		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
-		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
-		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
-		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
-	}
-	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
+	else
+		ret = adev->pcie_rreg(adev, (reg * 4));
+	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 	return ret;
 }
@@ -306,28 +368,19 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 		BUG();
 }
 
-void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
+void static inline amdgpu_device_wreg_no_kiq(struct amdgpu_device *adev, uint32_t reg,
+					     uint32_t v, uint32_t acc_flags)
 {
-	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
+	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 
-	if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+	if ((reg * 4) < adev->rmmio_size)
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(&adev->mmio_idx_lock, flags);
-		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
-		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
-		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
-	}
-
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-		udelay(500);
-	}
+	else
+		adev->pcie_wreg(adev, (reg * 4), v);
 }
 
 /**
- * amdgpu_mm_wreg - write to a memory mapped IO register
+ * amdgpu_device_wreg - write to a register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
@@ -336,17 +389,13 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
 *
 * Writes the value specified to the offset specified.
 */
-void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
-		    uint32_t acc_flags)
+void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
+			uint32_t acc_flags)
 {
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-		adev->last_mm_index = v;
-	}
-
-	if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 		return amdgpu_kiq_wreg(adev, reg, v);
 
-	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
+	amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
 }
 
 /*
@@ -365,7 +414,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 	}
 
-	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
+	amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
 }
 
 /**
@@ -397,20 +446,12 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-		adev->last_mm_index = v;
-	}
-
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 	}
-
-	if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-		udelay(500);
-	}
 }
 
 /**
@@ -1126,6 +1167,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 
+	amdgpu_gmc_tmz_set(adev);
+
 	return 0;
 }
 
@@ -1147,7 +1190,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero
 		return;
 
 	if (state == VGA_SWITCHEROO_ON) {
-		pr_info("amdgpu: switched on\n");
+		pr_info("switched on\n");
 		/* don't suspend or resume card normally */
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
@@ -1161,7 +1204,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero
 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
 		drm_kms_helper_poll_enable(dev);
 	} else {
-		pr_info("amdgpu: switched off\n");
+		pr_info("switched off\n");
 		drm_kms_helper_poll_disable(dev);
 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 		amdgpu_device_suspend(dev, true);
@@ -1524,9 +1567,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 		chip_name = "vega12";
 		break;
 	case CHIP_RAVEN:
-		if (adev->rev_id >= 8)
+		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
 			chip_name = "raven2";
-		else if (adev->pdev->device == 0x15d8)
+		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
 			chip_name = "picasso";
 		else
 			chip_name = "raven";
@@ -1574,8 +1617,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 
-		if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
+		if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
+			amdgpu_discovery_get_gfx_info(adev);
 			goto parse_soc_bounding_box;
+		}
 
 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
@@ -1721,19 +1766,31 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 		return -EINVAL;
 	}
 
-	r = amdgpu_device_parse_gpu_info_fw(adev);
-	if (r)
-		return r;
-
-	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
-		amdgpu_discovery_get_gfx_info(adev);
-
 	amdgpu_amdkfd_device_probe(adev);
 
 	if (amdgpu_sriov_vf(adev)) {
+		/* handle vbios stuff prior to full access mode for the new handshake */
+		if (adev->virt.req_init_data_ver == 1) {
+			if (!amdgpu_get_bios(adev)) {
+				DRM_ERROR("failed to get vbios\n");
+				return -EINVAL;
+			}
+
+			r = amdgpu_atombios_init(adev);
+			if (r) {
+				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
+				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
+				return r;
+			}
+		}
+	}
+
+	/* we need to send REQ_GPU here for the legacy handshake, otherwise the
+	 * vbios will not be prepared by the host for this VF */
+	if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver < 1) {
 		r = amdgpu_virt_request_full_gpu(adev, true);
 		if (r)
-			return -EAGAIN;
+			return r;
 	}
 
 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
@@ -1763,6 +1820,14 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 		}
 		/* get the vbios after the asic_funcs are set up */
 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+			r = amdgpu_device_parse_gpu_info_fw(adev);
+			if (r)
+				return r;
+
+			/* skip vbios handling for the new handshake */
+			if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver == 1)
+				continue;
+
 			/* Read BIOS */
 			if (!amdgpu_get_bios(adev))
 				return -EINVAL;
@@ -1889,6 +1954,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	if (r)
 		return r;
 
+	if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver > 0) {
+		r = amdgpu_virt_request_full_gpu(adev, true);
+		if (r)
+			return -EAGAIN;
+	}
+
 	for (i = 0; i < adev->num_ip_blocks; i++) {
 		if (!adev->ip_blocks[i].status.valid)
 			continue;
@@ -1975,6 +2046,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 		amdgpu_xgmi_add_device(adev);
 	amdgpu_amdkfd_device_init(adev);
 
+	amdgpu_fru_get_product_info(adev);
+
 init_failed:
 	if (amdgpu_sriov_vf(adev))
 		amdgpu_virt_release_full_gpu(adev, true);
@@ -2171,6 +2244,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 		adev->ip_blocks[i].status.late_initialized = true;
 	}
 
+	amdgpu_ras_set_error_query_ready(adev, true);
+
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
@@ -2203,7 +2278,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 				if (gpu_instance->adev->flags & AMD_IS_APU)
 					continue;
 
-				r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
+						AMDGPU_XGMI_PSTATE_MIN);
 				if (r) {
 					DRM_ERROR("pstate setting failed (%d).\n", r);
 					break;
@@ -2785,12 +2861,12 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	 * By default timeout for non compute jobs is 10000.
 	 * And there is no timeout enforced on compute jobs.
 	 * In SR-IOV or passthrough mode, timeout for compute
-	 * jobs are 10000 by default.
+	 * jobs are 60000 by default.
 	 */
 	adev->gfx_timeout = msecs_to_jiffies(10000);
 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
-		adev->compute_timeout = adev->gfx_timeout;
+		adev->compute_timeout =  msecs_to_jiffies(60000);
 	else
 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
 
@@ -2841,6 +2917,14 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	return ret;
 }
 
+static const struct attribute *amdgpu_dev_attributes[] = {
+	&dev_attr_product_name.attr,
+	&dev_attr_product_number.attr,
+	&dev_attr_serial_number.attr,
+	&dev_attr_pcie_replay_count.attr,
+	NULL
+};
+
 /**
 * amdgpu_device_init - initialize the driver
 *
@@ -2942,9 +3026,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	INIT_LIST_HEAD(&adev->shadow_list);
 	mutex_init(&adev->shadow_list_lock);
 
-	INIT_LIST_HEAD(&adev->ring_lru_list);
-	spin_lock_init(&adev->ring_lru_list_lock);
-
 	INIT_DELAYED_WORK(&adev->delayed_init_work,
 			  amdgpu_device_delayed_init_work_handler);
 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -2953,7 +3034,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
 	adev->gfx.gfx_off_req_count = 1;
-	adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
+	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
 
 	/* Registers mapping */
 	/* TODO: block userspace mapping of io register */
@@ -3002,18 +3083,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
 		adev->enable_mes = true;
 
-	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
-		r = amdgpu_discovery_init(adev);
-		if (r) {
-			dev_err(adev->dev, "amdgpu_discovery_init failed\n");
-			return r;
-		}
-	}
-
-	/* early init functions */
-	r = amdgpu_device_ip_early_init(adev);
-	if (r)
-		return r;
+	/* detect hw virtualization here */
+	amdgpu_detect_virtualization(adev);
 
 	r = amdgpu_device_get_job_timeout_settings(adev);
 	if (r) {
@@ -3021,6 +3092,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		return r;
 	}
 
+	/* early init functions */
+	r = amdgpu_device_ip_early_init(adev);
+	if (r)
+		return r;
+
 	/* doorbell bar mapping and doorbell index init*/
 	amdgpu_device_doorbell_init(adev);
 
@@ -3127,14 +3203,13 @@ fence_driver_init:
 		goto failed;
 	}
 
-	DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+	dev_info(adev->dev,
+		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
 			adev->gfx.config.max_shader_engines,
 			adev->gfx.config.max_sh_per_se,
 			adev->gfx.config.max_cu_per_sh,
 			adev->gfx.cu_info.number);
 
-	amdgpu_ctx_init_sched(adev);
-
 	adev->accel_working = true;
 
 	amdgpu_vm_check_compute_bug(adev);
@@ -3199,9 +3274,9 @@ fence_driver_init:
 	queue_delayed_work(system_wq, &adev->delayed_init_work,
 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
 
-	r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
+	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
 	if (r) {
-		dev_err(adev->dev, "Could not create pcie_replay_count");
+		dev_err(adev->dev, "Could not create amdgpu device attr\n");
 		return r;
 	}
 
@@ -3284,9 +3359,10 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	adev->rmmio = NULL;
 	amdgpu_device_doorbell_fini(adev);
 
-	device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
 	if (adev->ucode_sysfs_en)
 		amdgpu_ucode_sysfs_fini(adev);
+
+	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
 		amdgpu_pmu_fini(adev);
 	if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
@@ -3372,15 +3448,12 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 		}
 	}
 
-	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
-	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
-
-	amdgpu_amdkfd_suspend(adev, !fbcon);
-
 	amdgpu_ras_suspend(adev);
 
 	r = amdgpu_device_ip_suspend_phase1(adev);
 
+	amdgpu_amdkfd_suspend(adev, !fbcon);
+
 	/* evict vram memory */
 	amdgpu_bo_evict_vram(adev);
 
@@ -3757,6 +3830,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	if (r)
 		return r;
 
+	amdgpu_amdkfd_pre_reset(adev);
+
 	/* Resume IP prior to SMC */
 	r = amdgpu_device_ip_reinit_early_sriov(adev);
 	if (r)
@@ -3851,6 +3926,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_full_reset  = *need_full_reset_arg;
 
+	amdgpu_debugfs_wait_dump(adev);
+
 	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -4055,6 +4132,64 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	mutex_unlock(&adev->lock_reset);
 }
 
+static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
+{
+	struct pci_dev *p = NULL;
+
+	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+			adev->pdev->bus->number, 1);
+	if (p) {
+		pm_runtime_enable(&(p->dev));
+		pm_runtime_resume(&(p->dev));
+	}
+}
+
+static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
+{
+	enum amd_reset_method reset_method;
+	struct pci_dev *p = NULL;
+	u64 expires;
+
+	/*
+	 * For now, only BACO and mode1 reset are confirmed
+	 * to suffer from the audio issue if the audio device
+	 * is not properly suspended.
+	 */
+	reset_method = amdgpu_asic_reset_method(adev);
+	if ((reset_method != AMD_RESET_METHOD_BACO) &&
+	     (reset_method != AMD_RESET_METHOD_MODE1))
+		return -EINVAL;
+
+	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+			adev->pdev->bus->number, 1);
+	if (!p)
+		return -ENODEV;
+
+	expires = pm_runtime_autosuspend_expiration(&(p->dev));
+	if (!expires)
+		/*
+		 * If we cannot get the audio device autosuspend delay,
+		 * a fixed 4S interval will be used. Since 3S is the
+		 * audio controller's default autosuspend delay, the
+		 * 4S used here is guaranteed to cover it.
+		 */
+		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
+
+	while (!pm_runtime_status_suspended(&(p->dev))) {
+		if (!pm_runtime_suspend(&(p->dev)))
+			break;
+
+		if (expires < ktime_get_mono_fast_ns()) {
+			dev_warn(adev->dev, "failed to suspend display audio\n");
+			/* TODO: abort the succeeding gpu reset? */
+			return -ETIMEDOUT;
+		}
+	}
+
+	pm_runtime_disable(&(p->dev));
+
+	return 0;
+}
+
 /**
 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
 *
@@ -4070,7 +4205,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			      struct amdgpu_job *job)
 {
 	struct list_head device_list, *device_list_handle =  NULL;
-	bool need_full_reset, job_signaled;
+	bool need_full_reset = false;
+	bool job_signaled = false;
 	struct amdgpu_hive_info *hive = NULL;
 	struct amdgpu_device *tmp_adev = NULL;
 	int i, r = 0;
@@ -4078,6 +4214,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	bool use_baco =
 		(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
 		true : false;
+	bool audio_suspended = false;
 
 	/*
 	 * Flush RAM to disk so that after reboot
@@ -4091,16 +4228,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		emergency_restart();
 	}
 
-	need_full_reset = job_signaled = false;
-	INIT_LIST_HEAD(&device_list);
-
 	dev_info(adev->dev, "GPU %s begin!\n",
 		(in_ras_intr && !use_baco) ? "jobs stop":"reset");
 
-	cancel_delayed_work_sync(&adev->delayed_init_work);
-
-	hive = amdgpu_get_xgmi_hive(adev, false);
-
 	/*
 	 * Here we trylock to avoid chain of resets executing from
 	 * either trigger by jobs on different adevs in XGMI hive or jobs on
@@ -4108,39 +4238,25 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * We always reset all schedulers for device and all devices for XGMI
 	 * hive so that should take care of them too.
 	 */
-
+	hive = amdgpu_get_xgmi_hive(adev, true);
 	if (hive && !mutex_trylock(&hive->reset_lock)) {
 		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
 			  job ? job->base.id : -1, hive->hive_id);
+		mutex_unlock(&hive->hive_lock);
 		return 0;
 	}
 
-	/* Start with adev pre asic reset first for soft reset check.*/
-	if (!amdgpu_device_lock_adev(adev, !hive)) {
-		DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
-			  job ? job->base.id : -1);
-		return 0;
-	}
-
-	/* Block kfd: SRIOV would do it separately */
-	if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_pre_reset(adev);
-
-	/* Build list of devices to reset */
-	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
-		if (!hive) {
-			/*unlock kfd: SRIOV would do it separately */
-			if (!amdgpu_sriov_vf(adev))
-		                amdgpu_amdkfd_post_reset(adev);
-			amdgpu_device_unlock_adev(adev);
+	/*
+	 * Build list of devices to reset.
+	 * In case we are in XGMI hive mode, resort the device list
+	 * to put adev in the 1st position.
+	 */
+	INIT_LIST_HEAD(&device_list);
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		if (!hive)
 			return -ENODEV;
-		}
-
-		/*
-		 * In case we are in XGMI hive mode device reset is done for all the
-		 * nodes in the hive to retrain all XGMI links and hence the reset
-		 * sequence is executed in loop on all nodes.
-		 */
+		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
+			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
 		device_list_handle = &hive->device_list;
 	} else {
 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
@@ -4149,19 +4265,40 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		if (tmp_adev != adev) {
-			amdgpu_device_lock_adev(tmp_adev, false);
-			if (!amdgpu_sriov_vf(tmp_adev))
-			                amdgpu_amdkfd_pre_reset(tmp_adev);
+		if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+				  job ? job->base.id : -1);
+			mutex_unlock(&hive->hive_lock);
+			return 0;
 		}
 
 		/*
+		 * Try to put the audio codec into suspend state
+		 * before gpu reset started.
+		 *
+		 * Because the power domain of the graphics device
+		 * is shared with the AZ power domain, without this
+		 * we may change the audio hardware from behind
+		 * the audio driver's back. That will trigger
+		 * some audio codec errors.
+		 */
+		if (!amdgpu_device_suspend_display_audio(tmp_adev))
+			audio_suspended = true;
+
+		amdgpu_ras_set_error_query_ready(tmp_adev, false);
+
+		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
+
+		if (!amdgpu_sriov_vf(tmp_adev))
+			amdgpu_amdkfd_pre_reset(tmp_adev);
+
+		/*
 		 * Mark these ASICs to be reseted as untracked first
 		 * And add them back after reset completed
 		 */
 		amdgpu_unregister_gpu_instance(tmp_adev);
 
-		amdgpu_fbdev_set_suspend(adev, 1);
+		amdgpu_fbdev_set_suspend(tmp_adev, 1);
 
 		/* disable ras on ALL IPs */
 		if (!(in_ras_intr && !use_baco) &&
@@ -4181,7 +4318,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		}
 	}
 
-
 	if (in_ras_intr && !use_baco)
 		goto skip_sched_resume;
 
@@ -4192,30 +4328,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * job->base holds a reference to parent fence
 	 */
 	if (job && job->base.s_fence->parent &&
-	    dma_fence_is_signaled(job->base.s_fence->parent))
+	    dma_fence_is_signaled(job->base.s_fence->parent)) {
 		job_signaled = true;
-
-	if (job_signaled) {
 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
 		goto skip_hw_reset;
 	}
 
-
-	/* Guilty job will be freed after this*/
-	r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
-	if (r) {
-		/*TODO Should we stop ?*/
-		DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
-			  r, adev->ddev->unique);
-		adev->asic_reset_res = r;
-	}
-
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-
-		if (tmp_adev == adev)
-			continue;
-
 		r = amdgpu_device_pre_asic_reset(tmp_adev,
 						 NULL,
 						 &need_full_reset);
@@ -4277,11 +4397,15 @@ skip_sched_resume:
 		/*unlock kfd: SRIOV would do it separately */
 		if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
 	                amdgpu_amdkfd_post_reset(tmp_adev);
+		if (audio_suspended)
+			amdgpu_device_resume_display_audio(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
-	if (hive)
+	if (hive) {
 		mutex_unlock(&hive->reset_lock);
+		mutex_unlock(&hive->hive_lock);
+	}
 
 	if (r)
 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
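
For reference, a minimal userspace sketch of how the three FRU-backed attributes added above can be read. The card0 path is an assumption (the DRM card index varies per system), and the files only exist on server cards whose FRU EEPROM the driver supports:

/*
 * Minimal sketch: read the FRU-backed sysfs attributes added by this
 * patch. The card0 path is an assumption -- the index varies per
 * system, and the files are absent where the FRU EEPROM is unsupported.
 */
#include <stdio.h>

int main(void)
{
	static const char * const attrs[] = {
		"/sys/class/drm/card0/device/product_name",
		"/sys/class/drm/card0/device/product_number",
		"/sys/class/drm/card0/device/serial_number",
	};
	char buf[128];

	for (int i = 0; i < 3; i++) {
		FILE *f = fopen(attrs[i], "r");

		if (!f)
			continue; /* attribute absent: not a supported card */
		if (fgets(buf, sizeof(buf), f))
			printf("%s: %s", attrs[i], buf);
		fclose(f);
	}
	return 0;
}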
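The reworked amdgpu_device_rreg()/amdgpu_device_wreg() above collapse register access to two cases: direct MMIO when the dword offset lies inside the mapped register BAR, otherwise the indirect PCIE path. A self-contained sketch of that dispatch, with mmio_read32() and pcie_read32() as hypothetical stand-ins for readl() and adev->pcie_rreg():

/*
 * Sketch of the two-way dispatch left in amdgpu_device_rreg():
 * direct MMIO when the register falls inside the mapped region,
 * otherwise the indirect PCIE path.
 */
#include <stdint.h>
#include <stdio.h>

#define RMMIO_DWORDS 16 /* pretend the BAR covers 16 dword registers */

static uint32_t mmio[RMMIO_DWORDS];

static uint32_t mmio_read32(uint32_t byte_off)
{
	return mmio[byte_off / 4]; /* fast path: direct load */
}

static uint32_t pcie_read32(uint32_t byte_off)
{
	(void)byte_off;
	/* an indirect index/data access would go here */
	return 0xdeadbeef;
}

static uint32_t device_rreg(uint32_t reg) /* reg is a dword offset */
{
	if (reg * 4 < sizeof(mmio))
		return mmio_read32(reg * 4);
	return pcie_read32(reg * 4); /* out-of-BAR fallback */
}

int main(void)
{
	mmio[2] = 0x1234;
	printf("%#x %#x\n", (unsigned)device_rreg(2),
	       (unsigned)device_rreg(100));
	return 0;
}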
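Finally, amdgpu_device_suspend_display_audio() above polls the audio function until it reports suspended or a monotonic deadline passes. A self-contained sketch of that deadline-polling pattern; device_is_suspended() and try_suspend_device() are hypothetical stubs standing in for pm_runtime_status_suspended() and pm_runtime_suspend():

/*
 * Sketch of the deadline-polling pattern used by
 * amdgpu_device_suspend_display_audio(): retry an operation until it
 * is accepted or a monotonic deadline passes. The two device_* helpers
 * are hypothetical stubs so the sketch links and runs.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + (uint64_t)ts.tv_nsec;
}

static int attempts;

static bool device_is_suspended(void) /* hypothetical stub */
{
	return attempts >= 3;
}

static int try_suspend_device(void) /* hypothetical stub, 0 = accepted */
{
	return ++attempts >= 3 ? 0 : -EBUSY;
}

static int suspend_with_deadline(uint64_t timeout_ns)
{
	uint64_t expires = mono_ns() + timeout_ns;

	while (!device_is_suspended()) {
		if (!try_suspend_device())
			break; /* request accepted */
		if (mono_ns() > expires)
			return -ETIMEDOUT; /* give up, as the patch does */
	}
	return 0;
}

int main(void)
{
	printf("suspend: %d\n", suspend_with_deadline(4 * NSEC_PER_SEC));
	return 0;
}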