Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1139
1 file changed, 663 insertions, 476 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6238701cde23..7753a2e64d41 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -32,8 +32,6 @@  #include <linux/slab.h>  #include <linux/iommu.h>  #include <linux/pci.h> -#include <linux/devcoredump.h> -#include <generated/utsrelease.h>  #include <linux/pci-p2pdma.h>  #include <linux/apple-gmux.h> @@ -43,6 +41,7 @@  #include <drm/drm_fb_helper.h>  #include <drm/drm_probe_helper.h>  #include <drm/amdgpu_drm.h> +#include <linux/device.h>  #include <linux/vgaarb.h>  #include <linux/vga_switcheroo.h>  #include <linux/efi.h> @@ -74,6 +73,7 @@  #include "amdgpu_pmu.h"  #include "amdgpu_fru_eeprom.h"  #include "amdgpu_reset.h" +#include "amdgpu_virt.h"  #include <linux/suspend.h>  #include <drm/task_barrier.h> @@ -96,6 +96,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");  #define AMDGPU_RESUME_MS		2000  #define AMDGPU_MAX_RETRY_LIMIT		2  #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL) +#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2) +#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2) +#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)  static const struct drm_driver amdgpu_kms_driver; @@ -159,76 +162,138 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,  	return sysfs_emit(buf, "%llu\n", cnt);  } -static DEVICE_ATTR(pcie_replay_count, S_IRUGO, +static DEVICE_ATTR(pcie_replay_count, 0444,  		amdgpu_device_get_pcie_replay_count, NULL); -static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); - -/** - * DOC: product_name - * - * The amdgpu driver provides a sysfs API for reporting the product name - * for the device - * The file product_name is used for this and returns the product name - * as returned from the FRU. 
- * NOTE: This is only available for certain server cards - */ - -static ssize_t amdgpu_device_get_product_name(struct device *dev, -		struct device_attribute *attr, char *buf) +static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj, +					  struct bin_attribute *attr, char *buf, +					  loff_t ppos, size_t count)  { +	struct device *dev = kobj_to_dev(kobj);  	struct drm_device *ddev = dev_get_drvdata(dev);  	struct amdgpu_device *adev = drm_to_adev(ddev); +	ssize_t bytes_read; + +	switch (ppos) { +	case AMDGPU_SYS_REG_STATE_XGMI: +		bytes_read = amdgpu_asic_get_reg_state( +			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count); +		break; +	case AMDGPU_SYS_REG_STATE_WAFL: +		bytes_read = amdgpu_asic_get_reg_state( +			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count); +		break; +	case AMDGPU_SYS_REG_STATE_PCIE: +		bytes_read = amdgpu_asic_get_reg_state( +			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count); +		break; +	case AMDGPU_SYS_REG_STATE_USR: +		bytes_read = amdgpu_asic_get_reg_state( +			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count); +		break; +	case AMDGPU_SYS_REG_STATE_USR_1: +		bytes_read = amdgpu_asic_get_reg_state( +			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count); +		break; +	default: +		return -EINVAL; +	} + +	return bytes_read; +} + +BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL, +	 AMDGPU_SYS_REG_STATE_END); + +int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev) +{ +	int ret; -	return sysfs_emit(buf, "%s\n", adev->product_name); +	if (!amdgpu_asic_get_reg_state_supported(adev)) +		return 0; + +	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state); + +	return ret;  } -static DEVICE_ATTR(product_name, S_IRUGO, -		amdgpu_device_get_product_name, NULL); +void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev) +{ +	if (!amdgpu_asic_get_reg_state_supported(adev)) +		return; +	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state); +}  /** - * DOC: product_number + * DOC: board_info + * + * The amdgpu driver provides a sysfs API for giving board related information. + * It provides the form factor information in the format + * + *   type : form factor + * + * Possible form factor values + * + * - "cem"		- PCIE CEM card + * - "oam"		- Open Compute Accelerator Module + * - "unknown"	- Not known   * - * The amdgpu driver provides a sysfs API for reporting the part number - * for the device - * The file product_number is used for this and returns the part number - * as returned from the FRU. 
- * NOTE: This is only available for certain server cards   */ -static ssize_t amdgpu_device_get_product_number(struct device *dev, -		struct device_attribute *attr, char *buf) +static ssize_t amdgpu_device_get_board_info(struct device *dev, +					    struct device_attribute *attr, +					    char *buf)  {  	struct drm_device *ddev = dev_get_drvdata(dev);  	struct amdgpu_device *adev = drm_to_adev(ddev); +	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; +	const char *pkg; -	return sysfs_emit(buf, "%s\n", adev->product_number); +	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) +		pkg_type = adev->smuio.funcs->get_pkg_type(adev); + +	switch (pkg_type) { +	case AMDGPU_PKG_TYPE_CEM: +		pkg = "cem"; +		break; +	case AMDGPU_PKG_TYPE_OAM: +		pkg = "oam"; +		break; +	default: +		pkg = "unknown"; +		break; +	} + +	return sysfs_emit(buf, "%s : %s\n", "type", pkg);  } -static DEVICE_ATTR(product_number, S_IRUGO, -		amdgpu_device_get_product_number, NULL); +static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); -/** - * DOC: serial_number - * - * The amdgpu driver provides a sysfs API for reporting the serial number - * for the device - * The file serial_number is used for this and returns the serial number - * as returned from the FRU. - * NOTE: This is only available for certain server cards - */ +static struct attribute *amdgpu_board_attrs[] = { +	&dev_attr_board_info.attr, +	NULL, +}; -static ssize_t amdgpu_device_get_serial_number(struct device *dev, -		struct device_attribute *attr, char *buf) +static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, +					     struct attribute *attr, int n)  { +	struct device *dev = kobj_to_dev(kobj);  	struct drm_device *ddev = dev_get_drvdata(dev);  	struct amdgpu_device *adev = drm_to_adev(ddev); -	return sysfs_emit(buf, "%s\n", adev->serial); +	if (adev->flags & AMD_IS_APU) +		return 0; + +	return attr->mode;  } -static DEVICE_ATTR(serial_number, S_IRUGO, -		amdgpu_device_get_serial_number, NULL); +static const struct attribute_group amdgpu_board_attrs_group = { +	.attrs = amdgpu_board_attrs, +	.is_visible = amdgpu_board_attrs_is_visible +}; + +static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); +  /**   * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control @@ -370,10 +435,16 @@ size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,  		if (write) {  			memcpy_toio(addr, buf, count); +			/* Make sure HDP write cache flush happens without any reordering +			 * after the system memory contents are sent over PCIe device +			 */  			mb();  			amdgpu_device_flush_hdp(adev, NULL);  		} else {  			amdgpu_device_invalidate_hdp(adev, NULL); +			/* Make sure HDP read cache is invalidated before issuing a read +			 * to the PCIe device +			 */  			mb();  			memcpy_fromio(buf, addr, count);  		} @@ -464,7 +535,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&  		    amdgpu_sriov_runtime(adev) &&  		    down_read_trylock(&adev->reset_domain->sem)) { -			ret = amdgpu_kiq_rreg(adev, reg); +			ret = amdgpu_kiq_rreg(adev, reg, 0);  			up_read(&adev->reset_domain->sem);  		} else {  			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -481,8 +552,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  /*   * MMIO register read with bytes helper functions   * @offset:bytes offset from MMIO start - * -*/ + */  /**   * amdgpu_mm_rreg8 - read a memory mapped IO register @@ -502,12 +572,55 @@ uint8_t 
amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)  	BUG();  } + +/** + * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC + * + * @adev: amdgpu_device pointer + * @reg: dword aligned register offset + * @acc_flags: access flags which require special behavior + * @xcc_id: xcc accelerated compute core id + * + * Returns the 32 bit value from the offset specified. + */ +uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, +				uint32_t reg, uint32_t acc_flags, +				uint32_t xcc_id) +{ +	uint32_t ret, rlcg_flag; + +	if (amdgpu_device_skip_hw_access(adev)) +		return 0; + +	if ((reg * 4) < adev->rmmio_size) { +		if (amdgpu_sriov_vf(adev) && +		    !amdgpu_sriov_runtime(adev) && +		    adev->gfx.rlc.rlcg_reg_access_supported && +		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, +							 GC_HWIP, false, +							 &rlcg_flag)) { +			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id); +		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && +		    amdgpu_sriov_runtime(adev) && +		    down_read_trylock(&adev->reset_domain->sem)) { +			ret = amdgpu_kiq_rreg(adev, reg, xcc_id); +			up_read(&adev->reset_domain->sem); +		} else { +			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); +		} +	} else { +		ret = adev->pcie_rreg(adev, reg * 4); +	} + +	return ret; +} +  /*   * MMIO register write with bytes helper functions   * @offset:bytes offset from MMIO start   * @value: the value want to be written to the register - * -*/ + */ +  /**   * amdgpu_mm_wreg8 - read a memory mapped IO register   * @@ -549,7 +662,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,  		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&  		    amdgpu_sriov_runtime(adev) &&  		    down_read_trylock(&adev->reset_domain->sem)) { -			amdgpu_kiq_wreg(adev, reg, v); +			amdgpu_kiq_wreg(adev, reg, v, 0);  			up_read(&adev->reset_domain->sem);  		} else {  			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); @@ -567,11 +680,13 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,   * @adev: amdgpu_device pointer   * @reg: mmio/rlc register   * @v: value to write + * @xcc_id: xcc accelerated compute core id   *   * this function is invoked only for the debugfs register access   */  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, -			     uint32_t reg, uint32_t v) +			     uint32_t reg, uint32_t v, +			     uint32_t xcc_id)  {  	if (amdgpu_device_skip_hw_access(adev))  		return; @@ -580,7 +695,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,  	    adev->gfx.rlc.funcs &&  	    adev->gfx.rlc.funcs->is_rlcg_access_range) {  		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) -			return amdgpu_sriov_wreg(adev, reg, v, 0, 0); +			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);  	} else if ((reg * 4) >= adev->rmmio_size) {  		adev->pcie_wreg(adev, reg * 4, v);  	} else { @@ -589,90 +704,43 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,  }  /** - * amdgpu_mm_rdoorbell - read a doorbell dword + * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC   *   * @adev: amdgpu_device pointer - * @index: doorbell index - * - * Returns the value in the doorbell aperture at the - * requested doorbell index (CIK). 
- */ -u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) -{ -	if (amdgpu_device_skip_hw_access(adev)) -		return 0; - -	if (index < adev->doorbell.num_kernel_doorbells) { -		return readl(adev->doorbell.ptr + index); -	} else { -		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); -		return 0; -	} -} - -/** - * amdgpu_mm_wdoorbell - write a doorbell dword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write - * - * Writes @v to the doorbell aperture at the - * requested doorbell index (CIK). - */ -void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) -{ -	if (amdgpu_device_skip_hw_access(adev)) -		return; - -	if (index < adev->doorbell.num_kernel_doorbells) { -		writel(v, adev->doorbell.ptr + index); -	} else { -		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); -	} -} - -/** - * amdgpu_mm_rdoorbell64 - read a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index + * @reg: dword aligned register offset + * @v: 32 bit value to write to the register + * @acc_flags: access flags which require special behavior + * @xcc_id: xcc accelerated compute core id   * - * Returns the value in the doorbell aperture at the - * requested doorbell index (VEGA10+). + * Writes the value specified to the offset specified.   */ -u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) +void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, +			uint32_t reg, uint32_t v, +			uint32_t acc_flags, uint32_t xcc_id)  { -	if (amdgpu_device_skip_hw_access(adev)) -		return 0; - -	if (index < adev->doorbell.num_kernel_doorbells) { -		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); -	} else { -		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); -		return 0; -	} -} +	uint32_t rlcg_flag; -/** - * amdgpu_mm_wdoorbell64 - write a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write - * - * Writes @v to the doorbell aperture at the - * requested doorbell index (VEGA10+). 
- */ -void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) -{  	if (amdgpu_device_skip_hw_access(adev))  		return; -	if (index < adev->doorbell.num_kernel_doorbells) { -		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); +	if ((reg * 4) < adev->rmmio_size) { +		if (amdgpu_sriov_vf(adev) && +		    !amdgpu_sriov_runtime(adev) && +		    adev->gfx.rlc.rlcg_reg_access_supported && +		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, +							 GC_HWIP, true, +							 &rlcg_flag)) { +			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); +		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && +		    amdgpu_sriov_runtime(adev) && +		    down_read_trylock(&adev->reset_domain->sem)) { +			amdgpu_kiq_wreg(adev, reg, v, xcc_id); +			up_read(&adev->reset_domain->sem); +		} else { +			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); +		}  	} else { -		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); +		adev->pcie_wreg(adev, reg * 4, v);  	}  } @@ -716,12 +784,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,  	void __iomem *pcie_index_hi_offset;  	void __iomem *pcie_data_offset; -	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); -	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); -	if (adev->nbio.funcs->get_pcie_index_hi_offset) -		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); -	else +	if (unlikely(!adev->nbio.funcs)) { +		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK; +		pcie_data = AMDGPU_PCIE_DATA_FALLBACK; +	} else { +		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); +		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); +	} + +	if (reg_addr >> 32) { +		if (unlikely(!adev->nbio.funcs)) +			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK; +		else +			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); +	} else {  		pcie_index_hi = 0; +	}  	spin_lock_irqsave(&adev->pcie_idx_lock, flags);  	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -785,6 +863,56 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,  	return r;  } +u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, +				  u64 reg_addr) +{ +	unsigned long flags, pcie_index, pcie_data; +	unsigned long pcie_index_hi = 0; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_index_hi_offset; +	void __iomem *pcie_data_offset; +	u64 r; + +	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); +	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); +	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) +		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; +	if (pcie_index_hi != 0) +		pcie_index_hi_offset = (void __iomem *)adev->rmmio + +			pcie_index_hi * 4; + +	/* read low 32 bits */ +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	if (pcie_index_hi != 0) { +		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} +	r = readl(pcie_data_offset); +	/* read high 32 bits */ +	writel(reg_addr + 4, pcie_index_offset); +	readl(pcie_index_offset); +	if (pcie_index_hi != 0) { +		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} +	r |= ((u64)readl(pcie_data_offset) << 32); + +	/* clear the high bits */ +	if (pcie_index_hi != 0) { +		writel(0, 
pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} + +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + +	return r; +} +  /**   * amdgpu_device_indirect_wreg - write an indirect register address   * @@ -824,7 +952,7 @@ void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,  	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);  	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); -	if (adev->nbio.funcs->get_pcie_index_hi_offset) +	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))  		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);  	else  		pcie_index_hi = 0; @@ -889,6 +1017,55 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,  	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);  } +void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, +				   u64 reg_addr, u64 reg_data) +{ +	unsigned long flags, pcie_index, pcie_data; +	unsigned long pcie_index_hi = 0; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_index_hi_offset; +	void __iomem *pcie_data_offset; + +	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); +	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); +	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) +		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; +	if (pcie_index_hi != 0) +		pcie_index_hi_offset = (void __iomem *)adev->rmmio + +				pcie_index_hi * 4; + +	/* write low 32 bits */ +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	if (pcie_index_hi != 0) { +		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} +	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); +	readl(pcie_data_offset); +	/* write high 32 bits */ +	writel(reg_addr + 4, pcie_index_offset); +	readl(pcie_index_offset); +	if (pcie_index_hi != 0) { +		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} +	writel((u32)(reg_data >> 32), pcie_data_offset); +	readl(pcie_data_offset); + +	/* clear the high bits */ +	if (pcie_index_hi != 0) { +		writel(0, pcie_index_hi_offset); +		readl(pcie_index_hi_offset); +	} + +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} +  /**   * amdgpu_device_get_rev_id - query device rev_id   * @@ -966,6 +1143,13 @@ static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)  	return 0;  } +static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) +{ +	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); +	BUG(); +	return 0; +} +  /**   * amdgpu_invalid_wreg64 - dummy reg write function   * @@ -983,6 +1167,13 @@ static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint  	BUG();  } +static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) +{ +	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", +		  reg, v); +	BUG(); +} +  /**   * amdgpu_block_invalid_rreg - dummy reg read function   * @@ -1032,13 +1223,20 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,   */  static int amdgpu_device_asic_init(struct amdgpu_device *adev)  { +	int ret; +  	amdgpu_asic_pre_asic_init(adev); -	if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || -	    adev->ip_versions[GC_HWIP][0] >= 
IP_VERSION(11, 0, 0)) -		return amdgpu_atomfirmware_asic_init(adev, true); -	else +	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || +	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { +		amdgpu_psp_wait_for_bootloader(adev); +		ret = amdgpu_atomfirmware_asic_init(adev, true); +		return ret; +	} else {  		return amdgpu_atom_asic_init(adev->mode_info.atom_context); +	} + +	return 0;  }  /** @@ -1078,7 +1276,7 @@ static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)   * @registers: pointer to the register array   * @array_size: size of the register array   * - * Programs an array or registers with and and or masks. + * Programs an array or registers with and or masks.   * This is a helper for setting golden registers.   */  void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, @@ -1136,83 +1334,6 @@ int amdgpu_device_pci_reset(struct amdgpu_device *adev)  }  /* - * GPU doorbell aperture helpers function. - */ -/** - * amdgpu_device_doorbell_init - Init doorbell driver information. - * - * @adev: amdgpu_device pointer - * - * Init doorbell driver information (CIK) - * Returns 0 on success, error on failure. - */ -static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) -{ - -	/* No doorbell on SI hardware generation */ -	if (adev->asic_type < CHIP_BONAIRE) { -		adev->doorbell.base = 0; -		adev->doorbell.size = 0; -		adev->doorbell.num_kernel_doorbells = 0; -		adev->doorbell.ptr = NULL; -		return 0; -	} - -	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) -		return -EINVAL; - -	amdgpu_asic_init_doorbell_index(adev); - -	/* doorbell bar mapping */ -	adev->doorbell.base = pci_resource_start(adev->pdev, 2); -	adev->doorbell.size = pci_resource_len(adev->pdev, 2); - -	if (adev->enable_mes) { -		adev->doorbell.num_kernel_doorbells = -			adev->doorbell.size / sizeof(u32); -	} else { -		adev->doorbell.num_kernel_doorbells = -			min_t(u32, adev->doorbell.size / sizeof(u32), -			      adev->doorbell_index.max_assignment+1); -		if (adev->doorbell.num_kernel_doorbells == 0) -			return -EINVAL; - -		/* For Vega, reserve and map two pages on doorbell BAR since SDMA -		 * paging queue doorbell use the second page. The -		 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the -		 * doorbells are in the first page. So with paging queue enabled, -		 * the max num_kernel_doorbells should + 1 page (0x400 in dword) -		 */ -		if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) && -		    adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0)) -			adev->doorbell.num_kernel_doorbells += 0x400; -	} - -	adev->doorbell.ptr = ioremap(adev->doorbell.base, -				     adev->doorbell.num_kernel_doorbells * -				     sizeof(u32)); -	if (adev->doorbell.ptr == NULL) -		return -ENOMEM; - -	return 0; -} - -/** - * amdgpu_device_doorbell_fini - Tear down doorbell driver information. - * - * @adev: amdgpu_device pointer - * - * Tear down doorbell driver information (CIK) - */ -static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) -{ -	iounmap(adev->doorbell.ptr); -	adev->doorbell.ptr = NULL; -} - - - -/*   * amdgpu_device_wb_*()   * Writeback is the method by which the GPU updates special pages in memory   * with the status of certain GPU events (fences, ring pointers,etc.). 
@@ -1321,14 +1442,21 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)  	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);  	struct pci_bus *root;  	struct resource *res; -	unsigned i; +	unsigned int i;  	u16 cmd;  	int r; +	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) +		return 0; +  	/* Bypass for VF */  	if (amdgpu_sriov_vf(adev))  		return 0; +	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */ +	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR)) +		DRM_WARN("System can't access extended configuration space,please check!!\n"); +  	/* skip if the bios has already enabled large BAR */  	if (adev->gmc.real_vram_size &&  	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) @@ -1359,7 +1487,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)  			      cmd & ~PCI_COMMAND_MEMORY);  	/* Free the VRAM and doorbell BAR, we most likely need to move both. */ -	amdgpu_device_doorbell_fini(adev); +	amdgpu_doorbell_fini(adev);  	if (adev->asic_type >= CHIP_BONAIRE)  		pci_release_resource(adev->pdev, 2); @@ -1376,7 +1504,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)  	/* When the doorbell or fb BAR isn't available we have no chance of  	 * using the device.  	 */ -	r = amdgpu_device_doorbell_init(adev); +	r = amdgpu_doorbell_init(adev);  	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))  		return -ENODEV; @@ -1387,9 +1515,8 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)  static bool amdgpu_device_read_bios(struct amdgpu_device *adev)  { -	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) { +	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))  		return false; -	}  	return true;  } @@ -1425,12 +1552,14 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)  		if (adev->asic_type == CHIP_FIJI) {  			int err;  			uint32_t fw_ver; +  			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);  			/* force vPost if error occured */  			if (err)  				return true;  			fw_ver = *((uint32_t *)adev->pm.fw->data + 69); +			release_firmware(adev->pm.fw);  			if (fw_ver < 0x00160e00)  				return true;  		} @@ -1459,44 +1588,53 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)  }  /* - * On APUs with >= 64GB white flickering has been observed w/ SG enabled. - * Disable S/G on such systems until we have a proper fix. - * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 - * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 + * Check whether seamless boot is supported. + * + * So far we only support seamless boot on DCE 3.0 or later. + * If users report that it works on older ASICS as well, we may + * loosen this.   
*/ -bool amdgpu_sg_display_supported(struct amdgpu_device *adev) +bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)  { -	switch (amdgpu_sg_display) { +	switch (amdgpu_seamless) {  	case -1:  		break; -	case 0: -		return false;  	case 1:  		return true; +	case 0: +		return false;  	default: +		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", +			  amdgpu_seamless);  		return false;  	} -	if ((totalram_pages() << (PAGE_SHIFT - 10)) + -	    (adev->gmc.real_vram_size / 1024) >= 64000000) { -		DRM_WARN("Disabling S/G due to >=64GB RAM\n"); + +	if (!(adev->flags & AMD_IS_APU))  		return false; -	} -	return true; + +	if (adev->mman.keep_stolen_vga_memory) +		return false; + +	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);  }  /* - * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic - * speed switching. Until we have confirmation from Intel that a specific host - * supports it, it's safer that we keep it disabled for all. + * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids + * don't support dynamic speed switching. Until we have confirmation from Intel + * that a specific host supports it, it's safer that we keep it disabled for all.   *   * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/   * https://gitlab.freedesktop.org/drm/amd/-/issues/2663   */ -bool amdgpu_device_pcie_dynamic_switching_supported(void) +static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)  {  #if IS_ENABLED(CONFIG_X86)  	struct cpuinfo_x86 *c = &cpu_data(0); +	/* eGPU change speeds based on USB4 fabric conditions */ +	if (dev_is_removable(adev->dev)) +		return true; +  	if (c->x86_vendor == X86_VENDOR_INTEL)  		return false;  #endif @@ -1525,20 +1663,13 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)  	default:  		return false;  	} +	if (adev->flags & AMD_IS_APU) +		return false; +	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) +		return false;  	return pcie_aspm_enabled(adev->pdev);  } -bool amdgpu_device_aspm_support_quirk(void) -{ -#if IS_ENABLED(CONFIG_X86) -	struct cpuinfo_x86 *c = &cpu_data(0); - -	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); -#else -	return true; -#endif -} -  /* if we get transitioned to only one device, take VGA back */  /**   * amdgpu_device_vga_set_decode - enable/disable vga decode @@ -1553,6 +1684,7 @@ static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,  		bool state)  {  	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); +  	amdgpu_asic_set_vga_state(adev, state);  	if (state)  		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | @@ -1575,7 +1707,8 @@ static void amdgpu_device_check_block_size(struct amdgpu_device *adev)  {  	/* defines number of bits in page table versus page directory,  	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the -	 * page table and the remaining bits are in the page directory */ +	 * page table and the remaining bits are in the page directory +	 */  	if (amdgpu_vm_block_size == -1)  		return; @@ -1785,6 +1918,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,  	} else {  		pr_info("switched off\n");  		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; +		amdgpu_device_prepare(dev);  		amdgpu_device_suspend(dev, true);  		amdgpu_device_cache_pci_state(pdev);  		/* Shut down the device */ @@ -1807,7 +1941,7 @@ static 
bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)  {  	struct drm_device *dev = pci_get_drvdata(pdev); -	/* +       /*  	* FIXME: open_count is protected by drm_global_mutex but that would lead to  	* locking inversion with the driver load path. And the access here is  	* completely racy anyway. So don't bother with locking for now. @@ -2133,15 +2267,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)  	adev->firmware.gpu_info_fw = NULL; -	if (adev->mman.discovery_bin) { -		/* -		 * FIXME: The bounding box is still needed by Navi12, so -		 * temporarily read it from gpu_info firmware. Should be dropped -		 * when DAL no longer needs it. -		 */ -		if (adev->asic_type != CHIP_NAVI12) -			return 0; -	} +	if (adev->mman.discovery_bin) +		return 0;  	switch (adev->asic_type) {  	default: @@ -2256,7 +2383,6 @@ out:   */  static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)  { -	struct drm_device *dev = adev_to_drm(adev);  	struct pci_dev *parent;  	int i, r;  	bool total; @@ -2327,11 +2453,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)  	    (amdgpu_is_atpx_hybrid() ||  	     amdgpu_has_atpx_dgpu_power_cntl()) &&  	    ((adev->flags & AMD_IS_APU) == 0) && -	    !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) +	    !dev_is_removable(&adev->pdev->dev))  		adev->flags |= AMD_IS_PX;  	if (!(adev->flags & AMD_IS_APU)) { -		parent = pci_upstream_bridge(adev->pdev); +		parent = pcie_find_root_port(adev->pdev);  		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;  	} @@ -2341,6 +2467,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)  		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;  	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)  		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; +	if (!amdgpu_device_pcie_dynamic_switching_supported(adev)) +		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;  	total = true;  	for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2517,7 +2645,8 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)  			break;  		} -		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, +		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL, +				   DRM_SCHED_PRIORITY_COUNT,  				   ring->num_hw_submission, 0,  				   timeout, adev->reset_domain->wq,  				   ring->sched_score, ring->name, @@ -2527,6 +2656,18 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)  				  ring->name);  			return r;  		} +		r = amdgpu_uvd_entity_init(adev, ring); +		if (r) { +			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", +				  ring->name); +			return r; +		} +		r = amdgpu_vce_entity_init(adev, ring); +		if (r) { +			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", +				  ring->name); +			return r; +		}  	}  	amdgpu_xcp_update_partition_sched_list(adev); @@ -2607,6 +2748,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)  					goto init_failed;  				}  			} + +			r = amdgpu_seq64_init(adev); +			if (r) { +				DRM_ERROR("allocate seq64 failed %d\n", r); +				goto init_failed; +			}  		}  	} @@ -2687,6 +2834,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)  	if (r)  		goto init_failed; +	if (adev->mman.buffer_funcs_ring->sched.ready) +		amdgpu_ttm_set_buffer_funcs_status(adev, true); +  	/* Don't init kfd if whole hive need to be reset during init */  	if (!adev->gmc.xgmi.pending_reset) {  		kgd2kfd_init_zone_device(adev); @@ -2969,7 +3119,7 @@ static void amdgpu_device_smu_fini_early(struct 
amdgpu_device *adev)  {  	int i, r; -	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) +	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))  		return;  	for (i = 0; i < adev->num_ip_blocks; i++) { @@ -3066,6 +3216,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)  			amdgpu_device_wb_fini(adev);  			amdgpu_device_mem_scratch_fini(adev);  			amdgpu_ib_pool_fini(adev); +			amdgpu_seq64_fini(adev);  		}  		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev); @@ -3222,8 +3373,10 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)  		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */  		if (adev->in_s0ix && -		    (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && -		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) +		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= +		     IP_VERSION(5, 0, 0)) && +		    (adev->ip_blocks[i].version->type == +		     AMD_IP_BLOCK_TYPE_SDMA))  			continue;  		/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. @@ -3282,6 +3435,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)  		amdgpu_virt_request_full_gpu(adev, false);  	} +	amdgpu_ttm_set_buffer_funcs_status(adev, false); +  	r = amdgpu_device_ip_suspend_phase1(adev);  	if (r)  		return r; @@ -3452,7 +3607,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)   *   * Main resume function for hardware IPs.  The hardware IPs   * are split into two resume functions because they are - * are also used in in recovering from a GPU reset and some additional + * also used in recovering from a GPU reset and some additional   * steps need to be take between them.  In this case (S3/S4) they are   * run sequentially.   * Returns 0 on success, negative error code on failure. 
@@ -3461,12 +3616,6 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)  {  	int r; -	if (!adev->in_s0ix) { -		r = amdgpu_amdkfd_resume_iommu(adev); -		if (r) -			return r; -	} -  	r = amdgpu_device_ip_resume_phase1(adev);  	if (r)  		return r; @@ -3477,6 +3626,9 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)  	r = amdgpu_device_ip_resume_phase2(adev); +	if (adev->mman.buffer_funcs_ring->sched.ready) +		amdgpu_ttm_set_buffer_funcs_status(adev, true); +  	return r;  } @@ -3554,8 +3706,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)  #else  	default:  		if (amdgpu_dc > 0) -			DRM_INFO_ONCE("Display Core has been requested via kernel parameter " -					 "but isn't supported by ASIC, ignoring\n"); +			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");  		return false;  #endif  	} @@ -3607,9 +3758,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  		if (adev->asic_reset_res)  			goto fail; -		if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && -		    adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) -			adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); +		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);  	} else {  		task_barrier_full(&hive->tb); @@ -3711,9 +3860,6 @@ static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)  }  static const struct attribute *amdgpu_dev_attributes[] = { -	&dev_attr_product_name.attr, -	&dev_attr_product_number.attr, -	&dev_attr_serial_number.attr,  	&dev_attr_pcie_replay_count.attr,  	NULL  }; @@ -3724,10 +3870,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)  		adev->gfx.mcbp = true;  	else if (amdgpu_mcbp == 0)  		adev->gfx.mcbp = false; -	else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && -		 (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && -		 adev->gfx.num_gfx_rings) -		adev->gfx.mcbp = true;  	if (amdgpu_sriov_vf(adev))  		adev->gfx.mcbp = true; @@ -3790,6 +3932,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	adev->pciep_wreg = &amdgpu_invalid_wreg;  	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;  	adev->pcie_wreg64 = &amdgpu_invalid_wreg64; +	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; +	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;  	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;  	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;  	adev->didt_rreg = &amdgpu_invalid_rreg; @@ -3804,7 +3948,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);  	/* mutex initialization are all done here so we -	 * can recall function without having locking issues */ +	 * can recall function without having locking issues +	 */  	mutex_init(&adev->firmware.mutex);  	mutex_init(&adev->pm.mutex);  	mutex_init(&adev->gfx.gpu_clock_mutex); @@ -3844,6 +3989,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	INIT_LIST_HEAD(&adev->ras_list); +	INIT_LIST_HEAD(&adev->pm.od_kobj_list); +  	INIT_DELAYED_WORK(&adev->delayed_init_work,  			  amdgpu_device_delayed_init_work_handler);  	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -3881,11 +4028,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,  		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);  	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); -	if (adev->rmmio == NULL) { +	if (!adev->rmmio)  		return -ENOMEM; -	} +  	DRM_INFO("register mmio base: 0x%08X\n", 
(uint32_t)adev->rmmio_base); -	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); +	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);  	/*  	 * Reset domain needs to be present early, before XGMI hive discovered @@ -3907,13 +4054,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,  		return r;  	} +	amdgpu_device_set_mcbp(adev); +  	/* early init functions */  	r = amdgpu_device_ip_early_init(adev);  	if (r)  		return r; -	amdgpu_device_set_mcbp(adev); -  	/* Get rid of things like offb */  	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);  	if (r) @@ -3940,7 +4087,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	 * internal path natively support atomics, set have_atomics_support to true.  	 */  	} else if ((adev->flags & AMD_IS_APU) && -		   (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { +		   (amdgpu_ip_version(adev, GC_HWIP, 0) > +		    IP_VERSION(9, 0, 0))) {  		adev->have_atomics_support = true;  	} else {  		adev->have_atomics_support = @@ -3953,7 +4101,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,  		dev_info(adev->dev, "PCIE atomic ops is not supported\n");  	/* doorbell bar mapping and doorbell index init*/ -	amdgpu_device_doorbell_init(adev); +	amdgpu_doorbell_init(adev);  	if (amdgpu_emu_mode == 1) {  		/* post the asic on emulation mode */ @@ -3987,18 +4135,22 @@ int amdgpu_device_init(struct amdgpu_device *adev,  					adev->ip_blocks[i].status.hw = true;  				}  			} +		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) && +				   !amdgpu_device_has_display_hardware(adev)) { +					r = psp_gpu_reset(adev);  		} else { -			tmp = amdgpu_reset_method; -			/* It should do a default reset when loading or reloading the driver, -			 * regardless of the module parameter reset_method. -			 */ -			amdgpu_reset_method = AMD_RESET_METHOD_NONE; -			r = amdgpu_asic_reset(adev); -			amdgpu_reset_method = tmp; -			if (r) { -				dev_err(adev->dev, "asic reset on init failed\n"); -				goto failed; -			} +				tmp = amdgpu_reset_method; +				/* It should do a default reset when loading or reloading the driver, +				 * regardless of the module parameter reset_method. +				 */ +				amdgpu_reset_method = AMD_RESET_METHOD_NONE; +				r = amdgpu_asic_reset(adev); +				amdgpu_reset_method = tmp; +		} + +		if (r) { +		  dev_err(adev->dev, "asic reset on init failed\n"); +		  goto failed;  		}  	} @@ -4080,30 +4232,6 @@ fence_driver_init:  	/* Get a log2 for easy divisions. */  	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); -	r = amdgpu_atombios_sysfs_init(adev); -	if (r) -		drm_err(&adev->ddev, -			"registering atombios sysfs failed (%d).\n", r); - -	r = amdgpu_pm_sysfs_init(adev); -	if (r) -		DRM_ERROR("registering pm sysfs failed (%d).\n", r); - -	r = amdgpu_ucode_sysfs_init(adev); -	if (r) { -		adev->ucode_sysfs_en = false; -		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); -	} else -		adev->ucode_sysfs_en = true; - -	r = amdgpu_psp_sysfs_init(adev); -	if (r) { -		adev->psp_sysfs_en = false; -		if (!amdgpu_sriov_vf(adev)) -			DRM_ERROR("Creating psp sysfs failed\n"); -	} else -		adev->psp_sysfs_en = true; -  	/*  	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.  	 * Otherwise the mgpu fan boost feature will be skipped due to the @@ -4132,10 +4260,39 @@ fence_driver_init:  		flush_delayed_work(&adev->delayed_init_work);  	} +	/* +	 * Place those sysfs registering after `late_init`. 
As some of those +	 * operations performed in `late_init` might affect the sysfs +	 * interfaces creating. +	 */ +	r = amdgpu_atombios_sysfs_init(adev); +	if (r) +		drm_err(&adev->ddev, +			"registering atombios sysfs failed (%d).\n", r); + +	r = amdgpu_pm_sysfs_init(adev); +	if (r) +		DRM_ERROR("registering pm sysfs failed (%d).\n", r); + +	r = amdgpu_ucode_sysfs_init(adev); +	if (r) { +		adev->ucode_sysfs_en = false; +		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); +	} else +		adev->ucode_sysfs_en = true; +  	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);  	if (r)  		dev_err(adev->dev, "Could not create amdgpu device attr\n"); +	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); +	if (r) +		dev_err(adev->dev, +			"Could not create amdgpu board attributes\n"); + +	amdgpu_fru_sysfs_init(adev); +	amdgpu_reg_state_sysfs_init(adev); +  	if (IS_ENABLED(CONFIG_PERF_EVENTS))  		r = amdgpu_pmu_init(adev);  	if (r) @@ -4147,13 +4304,14 @@ fence_driver_init:  	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */  	/* this will fail for cards that aren't VGA class devices, just -	 * ignore it */ +	 * ignore it +	 */  	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)  		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);  	px = amdgpu_device_supports_px(ddev); -	if (px || (!pci_is_thunderbolt_attached(adev->pdev) && +	if (px || (!dev_is_removable(&adev->pdev->dev) &&  				apple_gmux_detect(NULL, NULL)))  		vga_switcheroo_register_client(adev->pdev,  					       &amdgpu_switcheroo_ops, px); @@ -4199,7 +4357,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)  	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);  	/* Unmap all mapped bars - Doorbell, registers and VRAM */ -	amdgpu_device_doorbell_fini(adev); +	amdgpu_doorbell_fini(adev);  	iounmap(adev->rmmio);  	adev->rmmio = NULL; @@ -4230,7 +4388,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)  	/* make sure IB test finished before entering exclusive mode  	 * to avoid preemption on IB test -	 * */ +	 */  	if (amdgpu_sriov_vf(adev)) {  		amdgpu_virt_request_full_gpu(adev, false);  		amdgpu_virt_fini_data_exchange(adev); @@ -4253,13 +4411,16 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)  		amdgpu_pm_sysfs_fini(adev);  	if (adev->ucode_sysfs_en)  		amdgpu_ucode_sysfs_fini(adev); -	if (adev->psp_sysfs_en) -		amdgpu_psp_sysfs_fini(adev);  	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); +	amdgpu_fru_sysfs_fini(adev); + +	amdgpu_reg_state_sysfs_fini(adev);  	/* disable ras feature must before hw fini */  	amdgpu_ras_pre_fini(adev); +	amdgpu_ttm_set_buffer_funcs_status(adev, false); +  	amdgpu_device_ip_fini_early(adev);  	amdgpu_irq_fini_hw(adev); @@ -4297,9 +4458,12 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)  	kfree(adev->bios);  	adev->bios = NULL; +	kfree(adev->fru_info); +	adev->fru_info = NULL; +  	px = amdgpu_device_supports_px(adev_to_drm(adev)); -	if (px || (!pci_is_thunderbolt_attached(adev->pdev) && +	if (px || (!dev_is_removable(&adev->pdev->dev) &&  				apple_gmux_detect(NULL, NULL)))  		vga_switcheroo_unregister_client(adev->pdev); @@ -4313,7 +4477,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)  		iounmap(adev->rmmio);  		adev->rmmio = NULL; -		amdgpu_device_doorbell_fini(adev); +		amdgpu_doorbell_fini(adev);  		drm_dev_exit(idx);  	} @@ -4356,6 +4520,50 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)   * Suspend & resume.   
*/  /** + * amdgpu_device_prepare - prepare for device suspend + * + * @dev: drm dev pointer + * + * Prepare to put the hw in the suspend state (all asics). + * Returns 0 for success or an error on failure. + * Called at driver suspend. + */ +int amdgpu_device_prepare(struct drm_device *dev) +{ +	struct amdgpu_device *adev = drm_to_adev(dev); +	int i, r; + +	amdgpu_choose_low_power_state(adev); + +	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) +		return 0; + +	/* Evict the majority of BOs before starting suspend sequence */ +	r = amdgpu_device_evict_resources(adev); +	if (r) +		goto unprepare; + +	flush_delayed_work(&adev->gfx.gfx_off_delay_work); + +	for (i = 0; i < adev->num_ip_blocks; i++) { +		if (!adev->ip_blocks[i].status.valid) +			continue; +		if (!adev->ip_blocks[i].version->funcs->prepare_suspend) +			continue; +		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); +		if (r) +			goto unprepare; +	} + +	return 0; + +unprepare: +	adev->in_s0ix = adev->in_s3 = false; + +	return r; +} + +/**   * amdgpu_device_suspend - initiate device suspend   *   * @dev: drm dev pointer @@ -4375,11 +4583,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)  	adev->in_suspend = true; -	/* Evict the majority of BOs before grabbing the full access */ -	r = amdgpu_device_evict_resources(adev); -	if (r) -		return r; -  	if (amdgpu_sriov_vf(adev)) {  		amdgpu_virt_fini_data_exchange(adev);  		r = amdgpu_virt_request_full_gpu(adev, false); @@ -4394,7 +4597,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)  		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);  	cancel_delayed_work_sync(&adev->delayed_init_work); -	flush_delayed_work(&adev->gfx.gfx_off_delay_work);  	amdgpu_ras_suspend(adev); @@ -4407,6 +4609,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)  	if (r)  		return r; +	amdgpu_ttm_set_buffer_funcs_status(adev, false); +  	amdgpu_fence_driver_hw_fini(adev);  	amdgpu_device_ip_suspend_phase2(adev); @@ -4414,6 +4618,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)  	if (amdgpu_sriov_vf(adev))  		amdgpu_virt_release_full_gpu(adev, false); +	r = amdgpu_dpm_notify_rlc_state(adev, false); +	if (r) +		return r; +  	return 0;  } @@ -4459,19 +4667,18 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)  	}  	amdgpu_fence_driver_hw_init(adev); -	r = amdgpu_device_ip_late_init(adev); -	if (r) -		goto exit; - -	queue_delayed_work(system_wq, &adev->delayed_init_work, -			   msecs_to_jiffies(AMDGPU_RESUME_MS)); -  	if (!adev->in_s0ix) {  		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);  		if (r)  			goto exit;  	} +	r = amdgpu_device_ip_late_init(adev); +	if (r) +		goto exit; + +	queue_delayed_work(system_wq, &adev->delayed_init_work, +			   msecs_to_jiffies(AMDGPU_RESUME_MS));  exit:  	if (amdgpu_sriov_vf(adev)) {  		amdgpu_virt_init_data_exchange(adev); @@ -4773,6 +4980,10 @@ retry:  		r = amdgpu_virt_reset_gpu(adev);  	if (r)  		return r; +	amdgpu_irq_gpu_reset_resume_helper(adev); + +	/* some sw clean up VF needs to do before recover */ +	amdgpu_virt_post_reset(adev);  	/* Resume IP prior to SMC */  	r = amdgpu_device_ip_reinit_early_sriov(adev); @@ -4799,7 +5010,6 @@ retry:  		amdgpu_put_xgmi_hive(hive);  	if (!r) { -		amdgpu_irq_gpu_reset_resume_helper(adev);  		r = amdgpu_ib_ring_tests(adev);  		amdgpu_amdkfd_post_reset(adev); @@ -4838,7 +5048,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {  		struct amdgpu_ring *ring 
= adev->rings[i]; -		if (!ring || !ring->sched.thread) +		if (!amdgpu_ring_sched_ready(ring))  			continue;  		spin_lock(&ring->sched.job_list_lock); @@ -4925,9 +5135,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)  	}  	if (ret) -		dev_err(adev->dev, "GPU mode1 reset failed\n"); +		goto mode1_reset_failed;  	amdgpu_device_load_pci_state(adev->pdev); +	ret = amdgpu_psp_wait_for_bootloader(adev); +	if (ret) +		goto mode1_reset_failed;  	/* wait for asic to come out of reset */  	for (i = 0; i < adev->usec_timeout; i++) { @@ -4938,7 +5151,17 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)  		udelay(1);  	} +	if (i >= adev->usec_timeout) { +		ret = -ETIMEDOUT; +		goto mode1_reset_failed; +	} +  	amdgpu_atombios_scratch_regs_engine_hung(adev, false); + +	return 0; + +mode1_reset_failed: +	dev_err(adev->dev, "GPU mode1 reset failed\n");  	return ret;  } @@ -4964,11 +5187,12 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {  		struct amdgpu_ring *ring = adev->rings[i]; -		if (!ring || !ring->sched.thread) +		if (!amdgpu_ring_sched_ready(ring))  			continue; -		/*clear job fence from fence drv to avoid force_completion -		 *leave NULL and vm flush fence in fence drv */ +		/* Clear job fence from fence drv to avoid force_completion +		 * leave NULL and vm flush fence in fence drv +		 */  		amdgpu_fence_driver_clear_job_fences(ring);  		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */ @@ -4982,7 +5206,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,  	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);  	/* If reset handler not implemented, continue; otherwise return */ -	if (r == -ENOSYS) +	if (r == -EOPNOTSUPP)  		r = 0;  	else  		return r; @@ -5022,75 +5246,23 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)  	lockdep_assert_held(&adev->reset_domain->sem); -	for (i = 0; i < adev->num_regs; i++) { -		adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); -		trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], -					     adev->reset_dump_reg_value[i]); -	} - -	return 0; -} - -#ifdef CONFIG_DEV_COREDUMP -static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, -		size_t count, void *data, size_t datalen) -{ -	struct drm_printer p; -	struct amdgpu_device *adev = data; -	struct drm_print_iterator iter; -	int i; - -	iter.data = buffer; -	iter.offset = 0; -	iter.start = offset; -	iter.remain = count; - -	p = drm_coredump_printer(&iter); - -	drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); -	drm_printf(&p, "kernel: " UTS_RELEASE "\n"); -	drm_printf(&p, "module: " KBUILD_MODNAME "\n"); -	drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); -	if (adev->reset_task_info.pid) -		drm_printf(&p, "process_name: %s PID: %d\n", -			   adev->reset_task_info.process_name, -			   adev->reset_task_info.pid); +	for (i = 0; i < adev->reset_info.num_regs; i++) { +		adev->reset_info.reset_dump_reg_value[i] = +			RREG32(adev->reset_info.reset_dump_reg_list[i]); -	if (adev->reset_vram_lost) -		drm_printf(&p, "VRAM is lost due to GPU reset!\n"); -	if (adev->num_regs) { -		drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n"); - -		for (i = 0; i < adev->num_regs; i++) -			drm_printf(&p, "0x%08x: 0x%08x\n", -				   adev->reset_dump_reg_list[i], -				   adev->reset_dump_reg_value[i]); +		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], +					     
 				   adev->reset_info.reset_dump_reg_value[i]);
 	}
 
-	return count - iter.remain;
-}
-
-static void amdgpu_devcoredump_free(void *data)
-{
+	return 0;
 }
 
-static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
-{
-	struct drm_device *dev = adev_to_drm(adev);
-
-	ktime_get_ts64(&adev->reset_time);
-	dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
-		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
-}
-#endif
-
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
-	bool gpu_reset_for_dev_remove = 0;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@@ -5100,7 +5272,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 	reset_context->reset_device_list = device_list_handle;
 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
 	/* If reset handler not implemented, continue; otherwise return */
-	if (r == -ENOSYS)
+	if (r == -EOPNOTSUPP)
 		r = 0;
 	else
 		return r;
@@ -5110,10 +5282,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
 
-	gpu_reset_for_dev_remove =
-		test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
-			test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
-
 	/*
 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
 	 * to allow proper links negotiation in FW (within 1 sec)
@@ -5131,7 +5299,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			if (r) {
 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
 					 r, adev_to_drm(tmp_adev)->unique);
-				break;
+				goto out;
 			}
 		}
 
@@ -5150,52 +5318,30 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 
 	if (!r && amdgpu_ras_intr_triggered()) {
 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-			if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
-			    tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
-				tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
+			amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
 		}
 
 		amdgpu_ras_intr_cleared();
 	}
 
-	/* Since the mode1 reset affects base ip blocks, the
-	 * phase1 ip blocks need to be resumed. Otherwise there
-	 * will be a BIOS signature error and the psp bootloader
-	 * can't load kdb on the next amdgpu install.
-	 */
-	if (gpu_reset_for_dev_remove) {
-		list_for_each_entry(tmp_adev, device_list_handle, reset_list)
-			amdgpu_device_ip_resume_phase1(tmp_adev);
-
-		goto end;
-	}
-
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
 		if (need_full_reset) {
 			/* post card */
+			amdgpu_ras_set_fed(tmp_adev, false);
 			r = amdgpu_device_asic_init(tmp_adev);
 			if (r) {
 				dev_warn(tmp_adev->dev, "asic atom init failed!");
 			} else {
 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
-				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
-				if (r)
-					goto out;
 
 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
 				if (r)
 					goto out;
 
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
-#ifdef CONFIG_DEV_COREDUMP
-				tmp_adev->reset_vram_lost = vram_lost;
-				memset(&tmp_adev->reset_task_info, 0,
-						sizeof(tmp_adev->reset_task_info));
-				if (reset_context->job && reset_context->job->vm)
-					tmp_adev->reset_task_info =
-						reset_context->job->vm->task_info;
-				amdgpu_reset_capture_coredumpm(tmp_adev);
-#endif
+
+				amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
 					amdgpu_inc_vram_lost(tmp_adev);
@@ -5205,10 +5351,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				if (r)
 					return r;
 
+				r = amdgpu_xcp_restore_partition_mode(
+					tmp_adev->xcp_mgr);
+				if (r)
+					goto out;
+
 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
 				if (r)
 					goto out;
 
+				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
+					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
+
 				if (vram_lost)
 					amdgpu_device_fill_reset_magic(tmp_adev);
 
@@ -5407,11 +5561,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool need_emergency_restart = false;
 	bool audio_suspended = false;
-	bool gpu_reset_for_dev_remove = false;
-
-	gpu_reset_for_dev_remove =
-			test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
-				test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
 	/*
 	 * Special case: RAS triggered and full reset isn't supported
@@ -5422,7 +5571,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * Flush RAM to disk so that after reboot
 	 * the user can read log and see why the system rebooted.
 	 */
-	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
+	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
+		amdgpu_ras_get_context(adev)->reboot) {
 		DRM_WARN("Emergency reboot.");
 
 		ksys_sync_helper();
@@ -5448,7 +5598,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
 			list_add_tail(&tmp_adev->reset_list, &device_list);
-			if (gpu_reset_for_dev_remove && adev->shutdown)
+			if (adev->shutdown)
 				tmp_adev->shutdown = true;
 		}
 		if (!list_is_first(&adev->reset_list, &device_list))
@@ -5505,7 +5655,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!amdgpu_ring_sched_ready(ring))
 				continue;
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5533,10 +5683,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
-		if (gpu_reset_for_dev_remove) {
-			/* Workaroud for ASICs need to disable SMC first */
-			amdgpu_device_smu_fini_early(tmp_adev);
-		}
 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
 		/*TODO Should we stop ?*/
 		if (r) {
@@ -5560,16 +5706,15 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 			adev->asic_reset_res = r;
 
 		/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
-		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
-		    adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
+		if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
+			    IP_VERSION(9, 4, 2) ||
+		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+		    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
 			amdgpu_ras_resume(adev);
 	} else {
 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
 		if (r && r == -EAGAIN)
 			goto retry;
-
-		if (!r && gpu_reset_for_dev_remove)
-			goto recover_end;
 	}
 
 skip_hw_reset:
@@ -5580,18 +5725,14 @@ skip_hw_reset:
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!amdgpu_ring_sched_ready(ring))
 				continue;
 
 			drm_sched_start(&ring->sched, true);
 		}
 
-		if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
-			amdgpu_mes_self_test(tmp_adev);
-
-		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
+		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
-		}
 
 		if (tmp_adev->asic_reset_res)
 			r = tmp_adev->asic_reset_res;
@@ -5629,7 +5770,6 @@ skip_sched_resume:
 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
 	}
 
-recover_end:
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 					    reset_list);
 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
@@ -5647,6 +5787,39 @@ recover_end:
 }
 
 /**
+ * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
+ *
+ * @adev: amdgpu_device pointer
+ * @speed: pointer to the speed of the link
+ * @width: pointer to the width of the link
+ *
+ * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
+ * first physical partner to an AMD dGPU.
+ * This will exclude any virtual switches and links.
+ */
+static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
+					    enum pci_bus_speed *speed,
+					    enum pcie_link_width *width)
+{
+	struct pci_dev *parent = adev->pdev;
+
+	if (!speed || !width)
+		return;
+
+	*speed = PCI_SPEED_UNKNOWN;
+	*width = PCIE_LNK_WIDTH_UNKNOWN;
+
+	while ((parent = pci_upstream_bridge(parent))) {
+		/* skip upstream/downstream switches internal to dGPU*/
+		if (parent->vendor == PCI_VENDOR_ID_ATI)
+			continue;
+		*speed = pcie_get_speed_cap(parent);
+		*width = pcie_get_width_cap(parent);
+		break;
+	}
+}
+
+/**
  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
  *
  * @adev: amdgpu_device pointer
@@ -5679,8 +5852,8 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
 		return;
 
-	pcie_bandwidth_available(adev->pdev, NULL,
-				 &platform_speed_cap, &platform_link_width);
+	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
+					&platform_link_width);
 
 	if (adev->pm.pcie_gen_mask == 0) {
 		/* asic caps */
@@ -5907,7 +6080,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
+			if (!amdgpu_ring_sched_ready(ring))
 				continue;
 
 			drm_sched_stop(&ring->sched, NULL);
@@ -5957,6 +6130,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 	struct amdgpu_reset_context reset_context;
 	u32 memsize;
 	struct list_head device_list;
+	struct amdgpu_hive_info *hive;
+	int hive_ras_recovery = 0;
+	struct amdgpu_ras *ras;
+
+	/* PCI error slot reset should be skipped During RAS recovery */
+	hive = amdgpu_get_xgmi_hive(adev);
+	if (hive) {
+		hive_ras_recovery = atomic_read(&hive->ras_recovery);
+		amdgpu_put_xgmi_hive(hive);
+	}
+	ras = amdgpu_ras_get_context(adev);
+	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
+		 ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+		return PCI_ERS_RESULT_RECOVERED;
 
 	DRM_INFO("PCI error: slot reset callback!!\n");
 
@@ -6035,7 +6222,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
-		if (!ring || !ring->sched.thread)
+		if (!amdgpu_ring_sched_ready(ring))
 			continue;
 
 		drm_sched_start(&ring->sched, true);
@@ -6264,7 +6451,7 @@ bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
 		return true;
 	default:
 		/* IP discovery */
-		if (!adev->ip_versions[DCE_HWIP][0] ||
+		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
 			return false;
 		return true;
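Note on the @@ -5100,7 hunk above: the "reset handler not implemented" case now reports -EOPNOTSUPP rather than -ENOSYS, since -ENOSYS is conventionally reserved for missing system calls. A minimal sketch of the dispatch convention that hunk relies on follows; the *_sketch names and the explicit handler parameter are illustrative stand-ins, not symbols from amdgpu_reset.c.

#include <linux/errno.h>

struct amdgpu_device;
struct amdgpu_reset_context;

typedef int (*asic_reset_fn_sketch)(struct amdgpu_device *adev,
				    struct amdgpu_reset_context *ctx);

/* A NULL handler means "no ASIC-specific reset registered"; that is reported
 * as -EOPNOTSUPP so the caller can fall back to the generic path, while any
 * other return code is treated as the final result of the reset attempt. */
static int perform_reset_sketch(struct amdgpu_device *adev,
				struct amdgpu_reset_context *ctx,
				asic_reset_fn_sketch handler)
{
	if (!handler)
		return -EOPNOTSUPP;

	return handler(adev, ctx);
}

/* Caller side, mirroring the hunk above. */
static int do_asic_reset_sketch(struct amdgpu_device *adev,
				struct amdgpu_reset_context *ctx,
				asic_reset_fn_sketch handler)
{
	int r = perform_reset_sketch(adev, ctx, handler);

	if (r == -EOPNOTSUPP)
		r = 0;		/* no handler: continue with the generic reset below */
	else
		return r;	/* handler ran (or failed): its result stands */

	/* ... generic per-device reset sequence would follow here ... */
	return r;
}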
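Several hunks above collapse the open-coded CONFIG_DEV_COREDUMP block into a single amdgpu_coredump() call. The helper itself is not part of this file; the sketch below only re-assembles what the removed lines did (record whether VRAM was lost, capture the offending task, and hand a snapshot to the devcoredump core). Field names follow the removed code and will not match the current amdgpu_reset.c layout.

#include <linux/devcoredump.h>
#include <linux/ktime.h>
#include <linux/string.h>

/* Sketch only, reconstructed from the removed lines above;
 * amdgpu_devcoredump_read()/_free() are the callbacks that this diff drops
 * from amdgpu_device.c. */
static void amdgpu_coredump_sketch(struct amdgpu_device *adev, bool vram_lost,
				   struct amdgpu_reset_context *reset_context)
{
	adev->reset_vram_lost = vram_lost;

	/* Remember which task submitted the job that triggered the reset. */
	memset(&adev->reset_task_info, 0, sizeof(adev->reset_task_info));
	if (reset_context->job && reset_context->job->vm)
		adev->reset_task_info = reset_context->job->vm->task_info;

	/* Timestamp the event and register the reader with devcoredump. */
	ktime_get_ts64(&adev->reset_time);
	dev_coredumpm(adev_to_drm(adev)->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
		      amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}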
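Four hunks above replace the open-coded "!ring || !ring->sched.thread" test with amdgpu_ring_sched_ready(). The helper is defined in amdgpu_ring.c rather than in this diff; the sketch below shows the check it centralizes, using the sched.ready flag that the buffer_funcs hunk also tests. The real helper may differ, for example by consulting the scheduler's workqueue state instead of the kthread pointer.

/* Minimal sketch, assuming the amdgpu driver headers are available. */
static inline bool ring_sched_ready_sketch(struct amdgpu_ring *ring)
{
	/* Unallocated rings and rings whose scheduler never came up must be
	 * skipped before calling drm_sched_stop()/drm_sched_start(). */
	return ring && ring->sched.ready;
}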
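Several hunks also move from indexing adev->ip_versions[HWIP][inst] directly to the amdgpu_ip_version() accessor. The accessor is declared in amdgpu.h, not here; the sketch below shows only the assumed shape, a thin wrapper over the same table the removed lines indexed (any masking of sub-revision bits in the real accessor is an assumption, not taken from this diff).

#include <linux/types.h>

/* Assumed shape of the accessor; illustrative only. */
static inline uint32_t ip_version_sketch(const struct amdgpu_device *adev,
					 uint8_t hwip, uint8_t inst)
{
	/* Centralizing the lookup keeps call sites uniform and gives one
	 * place to normalize version values before comparisons. */
	return adev->ip_versions[hwip][inst];
}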