Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 803 |
1 file changed, 643 insertions, 160 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d0b8d0d341af..37da3537ba2e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -130,7 +130,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,  		struct device_attribute *attr, char *buf)  {  	struct drm_device *ddev = dev_get_drvdata(dev); -	struct amdgpu_device *adev = ddev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(ddev);  	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);  	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); @@ -155,7 +155,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev,  		struct device_attribute *attr, char *buf)  {  	struct drm_device *ddev = dev_get_drvdata(dev); -	struct amdgpu_device *adev = ddev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(ddev);  	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);  } @@ -177,7 +177,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev,  		struct device_attribute *attr, char *buf)  {  	struct drm_device *ddev = dev_get_drvdata(dev); -	struct amdgpu_device *adev = ddev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(ddev);  	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);  } @@ -199,7 +199,7 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev,  		struct device_attribute *attr, char *buf)  {  	struct drm_device *ddev = dev_get_drvdata(dev); -	struct amdgpu_device *adev = ddev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(ddev);  	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);  } @@ -217,7 +217,7 @@ static DEVICE_ATTR(serial_number, S_IRUGO,   */  bool amdgpu_device_supports_boco(struct drm_device *dev)  { -	struct amdgpu_device *adev = dev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(dev);  	if (adev->flags & AMD_IS_PX)  		return true; @@ -234,7 +234,7 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)   */  bool amdgpu_device_supports_baco(struct drm_device *dev)  { -	struct amdgpu_device *adev = dev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(dev);  	return amdgpu_asic_supports_baco(adev);  } @@ -301,10 +301,10 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,  }  /* - * MMIO register access helper functions. + * register access helper functions.   */  /** - * amdgpu_mm_rreg - read a memory mapped IO register + * amdgpu_device_rreg - read a memory mapped IO or indirect register   *   * @adev: amdgpu_device pointer   * @reg: dword aligned register offset @@ -312,25 +312,29 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,   *   * Returns the 32 bit value from the offset specified.   
*/ -uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, -			uint32_t acc_flags) +uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, +			    uint32_t reg, uint32_t acc_flags)  {  	uint32_t ret; -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) -		return amdgpu_kiq_rreg(adev, reg); - -	if ((reg * 4) < adev->rmmio_size) -		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); -	else { -		unsigned long flags; +	if (adev->in_pci_err_recovery) +		return 0; -		spin_lock_irqsave(&adev->mmio_idx_lock, flags); -		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); -		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); -		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); +	if ((reg * 4) < adev->rmmio_size) { +		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && +		    amdgpu_sriov_runtime(adev) && +		    down_read_trylock(&adev->reset_sem)) { +			ret = amdgpu_kiq_rreg(adev, reg); +			up_read(&adev->reset_sem); +		} else { +			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); +		} +	} else { +		ret = adev->pcie_rreg(adev, reg * 4);  	} -	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); + +	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); +  	return ret;  } @@ -348,7 +352,11 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,   *   * Returns the 8 bit value from the offset specified.   */ -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) +{ +	if (adev->in_pci_err_recovery) +		return 0; +  	if (offset < adev->rmmio_size)  		return (readb(adev->rmmio + offset));  	BUG(); @@ -369,31 +377,19 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {   *   * Writes the value specified to the offset specified.   */ -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) +{ +	if (adev->in_pci_err_recovery) +		return; +  	if (offset < adev->rmmio_size)  		writeb(value, adev->rmmio + offset);  	else  		BUG();  } -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) -{ -	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); - -	if ((reg * 4) < adev->rmmio_size) -		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); -	else { -		unsigned long flags; - -		spin_lock_irqsave(&adev->mmio_idx_lock, flags); -		writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); -		writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); -		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); -	} -} -  /** - * amdgpu_mm_wreg - write to a memory mapped IO register + * amdgpu_device_wreg - write to a memory mapped IO or indirect register   *   * @adev: amdgpu_device pointer   * @reg: dword aligned register offset @@ -402,13 +398,27 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,   *   * Writes the value specified to the offset specified.   
*/ -void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, -		    uint32_t acc_flags) +void amdgpu_device_wreg(struct amdgpu_device *adev, +			uint32_t reg, uint32_t v, +			uint32_t acc_flags)  { -	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) -		return amdgpu_kiq_wreg(adev, reg, v); +	if (adev->in_pci_err_recovery) +		return; + +	if ((reg * 4) < adev->rmmio_size) { +		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && +		    amdgpu_sriov_runtime(adev) && +		    down_read_trylock(&adev->reset_sem)) { +			amdgpu_kiq_wreg(adev, reg, v); +			up_read(&adev->reset_sem); +		} else { +			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); +		} +	} else { +		adev->pcie_wreg(adev, reg * 4, v); +	} -	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); +	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);  }  /* @@ -416,18 +426,20 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,   *   * this function is invoked only the debugfs register access   * */ -void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, -		    uint32_t acc_flags) +void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, +			     uint32_t reg, uint32_t v)  { -	if (amdgpu_sriov_fullaccess(adev) && -		adev->gfx.rlc.funcs && -		adev->gfx.rlc.funcs->is_rlcg_access_range) { +	if (adev->in_pci_err_recovery) +		return; +	if (amdgpu_sriov_fullaccess(adev) && +	    adev->gfx.rlc.funcs && +	    adev->gfx.rlc.funcs->is_rlcg_access_range) {  		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))  			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); +	} else { +		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));  	} - -	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);  }  /** @@ -440,6 +452,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t   */  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  { +	if (adev->in_pci_err_recovery) +		return 0; +  	if ((reg * 4) < adev->rio_mem_size)  		return ioread32(adev->rio_mem + (reg * 4));  	else { @@ -459,6 +474,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)   */  void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)  { +	if (adev->in_pci_err_recovery) +		return; +  	if ((reg * 4) < adev->rio_mem_size)  		iowrite32(v, adev->rio_mem + (reg * 4));  	else { @@ -478,6 +496,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)   */  u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)  { +	if (adev->in_pci_err_recovery) +		return 0; +  	if (index < adev->doorbell.num_doorbells) {  		return readl(adev->doorbell.ptr + index);  	} else { @@ -498,6 +519,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)   */  void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)  { +	if (adev->in_pci_err_recovery) +		return; +  	if (index < adev->doorbell.num_doorbells) {  		writel(v, adev->doorbell.ptr + index);  	} else { @@ -516,6 +540,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)   */  u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)  { +	if (adev->in_pci_err_recovery) +		return 0; +  	if (index < adev->doorbell.num_doorbells) {  		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));  	} else { @@ -536,6 +563,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)   */  void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)  { +	if (adev->in_pci_err_recovery) +		return; +  	if (index < adev->doorbell.num_doorbells) {  		
atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);  	} else { @@ -544,6 +574,135 @@ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)  }  /** + * amdgpu_device_indirect_rreg - read an indirect register + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * + * Returns the value of indirect register @reg_addr + */ +u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, +				u32 pcie_index, u32 pcie_data, +				u32 reg_addr) +{ +	unsigned long flags; +	u32 r; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_data_offset; + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	r = readl(pcie_data_offset); +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + +	return r; +} + +/** + * amdgpu_device_indirect_rreg64 - read a 64bits indirect register + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * + * Returns the value of indirect register @reg_addr + */ +u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, +				  u32 pcie_index, u32 pcie_data, +				  u32 reg_addr) +{ +	unsigned long flags; +	u64 r; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_data_offset; + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + +	/* read low 32 bits */ +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	r = readl(pcie_data_offset); +	/* read high 32 bits */ +	writel(reg_addr + 4, pcie_index_offset); +	readl(pcie_index_offset); +	r |= ((u64)readl(pcie_data_offset) << 32); +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + +	return r; +} + +/** + * amdgpu_device_indirect_wreg - write an indirect register address + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * @reg_addr: indirect register offset + * @reg_data: indirect register data + * + */ +void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, +				 u32 pcie_index, u32 pcie_data, +				 u32 reg_addr, u32 reg_data) +{ +	unsigned long flags; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_data_offset; + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	writel(reg_data, pcie_data_offset); +	readl(pcie_data_offset); +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + +/** + * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * @reg_addr: indirect register offset + * @reg_data: indirect register data + * + */ +void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, +				   u32 pcie_index, u32 pcie_data, +				   u32 reg_addr, u64 reg_data) +{ +	unsigned long flags; +	void __iomem *pcie_index_offset; +	void __iomem *pcie_data_offset; + +	spin_lock_irqsave(&adev->pcie_idx_lock, flags); +	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; +	pcie_data_offset = (void __iomem *)adev->rmmio 
+ pcie_data * 4; + +	/* write low 32 bits */ +	writel(reg_addr, pcie_index_offset); +	readl(pcie_index_offset); +	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); +	readl(pcie_data_offset); +	/* write high 32 bits */ +	writel(reg_addr + 4, pcie_index_offset); +	readl(pcie_index_offset); +	writel((u32)(reg_data >> 32), pcie_data_offset); +	readl(pcie_data_offset); +	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + +/**   * amdgpu_invalid_rreg - dummy reg read function   *   * @adev: amdgpu device pointer @@ -652,6 +811,20 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,  }  /** + * amdgpu_device_asic_init - Wrapper for atom asic_init + * + * @dev: drm_device pointer + * + * Does any asic specific work and then calls atom asic init. + */ +static int amdgpu_device_asic_init(struct amdgpu_device *adev) +{ +	amdgpu_asic_pre_asic_init(adev); + +	return amdgpu_atom_asic_init(adev->mode_info.atom_context); +} + +/**   * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page   *   * @adev: amdgpu device pointer @@ -1197,6 +1370,15 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)  	amdgpu_gmc_tmz_set(adev); +	if (amdgpu_num_kcq == -1) { +		amdgpu_num_kcq = 8; +	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) { +		amdgpu_num_kcq = 8; +		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n"); +	} + +	amdgpu_gmc_noretry_set(adev); +  	return 0;  } @@ -1209,7 +1391,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)   * Callback for the switcheroo driver.  Suspends or resumes the   * the asics before or after it is powered up using ACPI methods.   */ -static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) +static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, +					enum vga_switcheroo_state state)  {  	struct drm_device *dev = pci_get_drvdata(pdev);  	int r; @@ -1223,7 +1406,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero  		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;  		pci_set_power_state(dev->pdev, PCI_D0); -		pci_restore_state(dev->pdev); +		amdgpu_device_load_pci_state(dev->pdev);  		r = pci_enable_device(dev->pdev);  		if (r)  			DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1236,7 +1419,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero  		drm_kms_helper_poll_disable(dev);  		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;  		amdgpu_device_suspend(dev, true); -		pci_save_state(dev->pdev); +		amdgpu_device_cache_pci_state(dev->pdev);  		/* Shut down the device */  		pci_disable_device(dev->pdev);  		pci_set_power_state(dev->pdev, PCI_D3cold); @@ -1502,7 +1685,7 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)  	adev->enable_virtual_display = false;  	if (amdgpu_virtual_display) { -		struct drm_device *ddev = adev->ddev; +		struct drm_device *ddev = adev_to_drm(adev);  		const char *pci_address_name = pci_name(ddev->pdev);  		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; @@ -1561,7 +1744,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)  	adev->firmware.gpu_info_fw = NULL; -	if (adev->discovery_bin) { +	if (adev->mman.discovery_bin) {  		amdgpu_discovery_get_gfx_info(adev);  		/* @@ -1929,7 +2112,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)  			if (adev->ip_blocks[i].status.hw == true)  				break; -			if 
(adev->in_gpu_reset || adev->in_suspend) { +			if (amdgpu_in_reset(adev) || adev->in_suspend) {  				r = adev->ip_blocks[i].version->funcs->resume(adev);  				if (r) {  					DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2049,13 +2232,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)  	 * it should be called after amdgpu_device_ip_hw_init_phase2  since  	 * for some ASICs the RAS EEPROM code relies on SMU fully functioning  	 * for I2C communication which only true at this point. -	 * recovery_init may fail, but it can free all resources allocated by -	 * itself and its failure should not stop amdgpu init process. +	 * +	 * amdgpu_ras_recovery_init may fail, but the upper only cares the +	 * failure from bad gpu situation and stop amdgpu init process +	 * accordingly. For other failed cases, it will still release all +	 * the resource and print error message, rather than returning one +	 * negative value to upper level.  	 *  	 * Note: theoretically, this should be called before all vram allocations  	 * to protect retired page from abusing  	 */ -	amdgpu_ras_recovery_init(adev); +	r = amdgpu_ras_recovery_init(adev); +	if (r) +		goto init_failed;  	if (adev->gmc.xgmi.num_physical_nodes > 1)  		amdgpu_xgmi_add_device(adev); @@ -2100,7 +2289,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)  			AMDGPU_RESET_MAGIC_NUM))  		return true; -	if (!adev->in_gpu_reset) +	if (!amdgpu_in_reset(adev))  		return false;  	/* @@ -2211,9 +2400,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)  		gpu_ins = &(mgpu_info.gpu_ins[i]);  		adev = gpu_ins->adev;  		if (!(adev->flags & AMD_IS_APU) && -		    !gpu_ins->mgpu_fan_enabled && -		    adev->powerplay.pp_funcs && -		    adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { +		    !gpu_ins->mgpu_fan_enabled) {  			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);  			if (ret)  				break; @@ -2568,17 +2755,16 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)  		AMD_IP_BLOCK_TYPE_IH,  	}; -	for (i = 0; i < adev->num_ip_blocks; i++) -		adev->ip_blocks[i].status.hw = false; -  	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {  		int j;  		struct amdgpu_ip_block *block; -		for (j = 0; j < adev->num_ip_blocks; j++) { -			block = &adev->ip_blocks[j]; +		block = &adev->ip_blocks[i]; +		block->status.hw = false; -			if (block->version->type != ip_order[i] || +		for (j = 0; j < ARRAY_SIZE(ip_order); j++) { + +			if (block->version->type != ip_order[j] ||  				!block->status.valid)  				continue; @@ -2771,6 +2957,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)  {  	switch (asic_type) {  #if defined(CONFIG_DRM_AMD_DC) +#if defined(CONFIG_DRM_AMD_DC_SI) +	case CHIP_TAHITI: +	case CHIP_PITCAIRN: +	case CHIP_VERDE: +	case CHIP_OLAND: +#endif  	case CHIP_BONAIRE:  	case CHIP_KAVERI:  	case CHIP_KABINI: @@ -2825,7 +3017,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)   */  bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)  { -	if (amdgpu_sriov_vf(adev)) +	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)  		return false;  	return amdgpu_device_asic_has_dc_support(adev->asic_type); @@ -2836,7 +3028,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  {  	struct amdgpu_device *adev =  		container_of(__work, struct amdgpu_device, xgmi_reset_work); -	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); +	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);  	/* It's a bug to not have a hive 
within this function */  	if (WARN_ON(!hive)) @@ -2851,13 +3043,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {  		task_barrier_enter(&hive->tb); -		adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); +		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));  		if (adev->asic_reset_res)  			goto fail;  		task_barrier_exit(&hive->tb); -		adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); +		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));  		if (adev->asic_reset_res)  			goto fail; @@ -2873,7 +3065,8 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)  fail:  	if (adev->asic_reset_res)  		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", -			 adev->asic_reset_res, adev->ddev->unique); +			 adev->asic_reset_res, adev_to_drm(adev)->unique); +	amdgpu_put_xgmi_hive(hive);  }  static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -2952,12 +3145,11 @@ static const struct attribute *amdgpu_dev_attributes[] = {  	NULL  }; +  /**   * amdgpu_device_init - initialize the driver   *   * @adev: amdgpu_device pointer - * @ddev: drm dev pointer - * @pdev: pci dev pointer   * @flags: driver flags   *   * Initializes the driver info and hw (all asics). @@ -2965,18 +3157,15 @@ static const struct attribute *amdgpu_dev_attributes[] = {   * Called at driver startup.   */  int amdgpu_device_init(struct amdgpu_device *adev, -		       struct drm_device *ddev, -		       struct pci_dev *pdev,  		       uint32_t flags)  { +	struct drm_device *ddev = adev_to_drm(adev); +	struct pci_dev *pdev = adev->pdev;  	int r, i;  	bool boco = false;  	u32 max_MBps;  	adev->shutdown = false; -	adev->dev = &pdev->dev; -	adev->ddev = ddev; -	adev->pdev = pdev;  	adev->flags = flags;  	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) @@ -3032,7 +3221,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	mutex_init(&adev->mn_lock);  	mutex_init(&adev->virt.vf_errors.lock);  	hash_init(adev->mn_hash); -	mutex_init(&adev->lock_reset); +	atomic_set(&adev->in_gpu_reset, 0); +	init_rwsem(&adev->reset_sem);  	mutex_init(&adev->psp.mutex);  	mutex_init(&adev->notifier_lock); @@ -3127,13 +3317,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,  	r = amdgpu_device_get_job_timeout_settings(adev);  	if (r) {  		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); -		return r; +		goto failed_unmap;  	}  	/* early init functions */  	r = amdgpu_device_ip_early_init(adev);  	if (r) -		return r; +		goto failed_unmap;  	/* doorbell bar mapping and doorbell index init*/  	amdgpu_device_doorbell_init(adev); @@ -3174,6 +3364,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,  		}  	} +	pci_enable_pcie_error_reporting(adev->ddev.pdev); +  	/* Post card if necessary */  	if (amdgpu_device_need_post(adev)) {  		if (!adev->bios) { @@ -3182,7 +3374,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,  			goto failed;  		}  		DRM_INFO("GPU posting now...\n"); -		r = amdgpu_atom_asic_init(adev->mode_info.atom_context); +		r = amdgpu_device_asic_init(adev);  		if (r) {  			dev_err(adev->dev, "gpu post error!\n");  			goto failed; @@ -3220,7 +3412,7 @@ fence_driver_init:  	}  	/* init the mode config */ -	drm_mode_config_init(adev->ddev); +	drm_mode_config_init(adev_to_drm(adev));  	r = amdgpu_device_ip_init(adev);  	if (r) { @@ -3316,16 +3508,18 @@ fence_driver_init:  		
flush_delayed_work(&adev->delayed_init_work);  	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); -	if (r) { +	if (r)  		dev_err(adev->dev, "Could not create amdgpu device attr\n"); -		return r; -	}  	if (IS_ENABLED(CONFIG_PERF_EVENTS))  		r = amdgpu_pmu_init(adev);  	if (r)  		dev_err(adev->dev, "amdgpu_pmu_init failed\n"); +	/* Have stored pci confspace at hand for restore in sudden PCI error */ +	if (amdgpu_device_cache_pci_state(adev->pdev)) +		pci_restore_state(pdev); +  	return 0;  failed: @@ -3333,6 +3527,10 @@ failed:  	if (boco)  		vga_switcheroo_fini_domain_pm_ops(adev->dev); +failed_unmap: +	iounmap(adev->rmmio); +	adev->rmmio = NULL; +  	return r;  } @@ -3346,31 +3544,33 @@ failed:   */  void amdgpu_device_fini(struct amdgpu_device *adev)  { -	int r; - -	DRM_INFO("amdgpu: finishing device.\n"); +	dev_info(adev->dev, "amdgpu: finishing device.\n");  	flush_delayed_work(&adev->delayed_init_work);  	adev->shutdown = true; +	kfree(adev->pci_state); +  	/* make sure IB test finished before entering exclusive mode  	 * to avoid preemption on IB test  	 * */ -	if (amdgpu_sriov_vf(adev)) +	if (amdgpu_sriov_vf(adev)) {  		amdgpu_virt_request_full_gpu(adev, false); +		amdgpu_virt_fini_data_exchange(adev); +	}  	/* disable all interrupts */  	amdgpu_irq_disable_all(adev);  	if (adev->mode_info.mode_config_initialized){  		if (!amdgpu_device_has_dc_support(adev)) -			drm_helper_force_disable_all(adev->ddev); +			drm_helper_force_disable_all(adev_to_drm(adev));  		else -			drm_atomic_helper_shutdown(adev->ddev); +			drm_atomic_helper_shutdown(adev_to_drm(adev));  	}  	amdgpu_fence_driver_fini(adev);  	if (adev->pm_sysfs_en)  		amdgpu_pm_sysfs_fini(adev);  	amdgpu_fbdev_fini(adev); -	r = amdgpu_device_ip_fini(adev); +	amdgpu_device_ip_fini(adev);  	release_firmware(adev->firmware.gpu_info_fw);  	adev->firmware.gpu_info_fw = NULL;  	adev->accel_working = false; @@ -3388,7 +3588,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)  	     amdgpu_has_atpx_dgpu_power_cntl()) &&  	    !pci_is_thunderbolt_attached(adev->pdev))  		vga_switcheroo_unregister_client(adev->pdev); -	if (amdgpu_device_supports_boco(adev->ddev)) +	if (amdgpu_device_supports_boco(adev_to_drm(adev)))  		vga_switcheroo_fini_domain_pm_ops(adev->dev);  	vga_client_register(adev->pdev, NULL, NULL, NULL);  	if (adev->rio_mem) @@ -3404,7 +3604,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)  	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);  	if (IS_ENABLED(CONFIG_PERF_EVENTS))  		amdgpu_pmu_fini(adev); -	if (adev->discovery_bin) +	if (adev->mman.discovery_bin)  		amdgpu_discovery_fini(adev);  } @@ -3430,11 +3630,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)  	struct drm_connector_list_iter iter;  	int r; -	if (dev == NULL || dev->dev_private == NULL) { -		return -ENODEV; -	} - -	adev = dev->dev_private; +	adev = drm_to_adev(dev);  	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)  		return 0; @@ -3522,7 +3718,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)  {  	struct drm_connector *connector;  	struct drm_connector_list_iter iter; -	struct amdgpu_device *adev = dev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(dev);  	struct drm_crtc *crtc;  	int r = 0; @@ -3531,14 +3727,14 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)  	/* post card */  	if (amdgpu_device_need_post(adev)) { -		r = amdgpu_atom_asic_init(adev->mode_info.atom_context); +		r = amdgpu_device_asic_init(adev);  		if (r) -			DRM_ERROR("amdgpu asic init 
failed\n"); +			dev_err(adev->dev, "amdgpu asic init failed\n");  	}  	r = amdgpu_device_ip_resume(adev);  	if (r) { -		DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); +		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);  		return r;  	}  	amdgpu_fence_driver_resume(adev); @@ -3562,7 +3758,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)  				if (r == 0) {  					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);  					if (r != 0) -						DRM_ERROR("Failed to pin cursor BO (%d)\n", r); +						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);  					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);  					amdgpu_bo_unreserve(aobj);  				} @@ -3652,7 +3848,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)  			adev->ip_blocks[i].status.hang =  				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);  		if (adev->ip_blocks[i].status.hang) { -			DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); +			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);  			asic_hang = true;  		}  	} @@ -3713,7 +3909,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)  		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||  		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {  			if (adev->ip_blocks[i].status.hang) { -				DRM_INFO("Some block need full reset!\n"); +				dev_info(adev->dev, "Some block need full reset!\n");  				return true;  			}  		} @@ -3801,7 +3997,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)  	else  		tmo = msecs_to_jiffies(100); -	DRM_INFO("recover vram bo from shadow start\n"); +	dev_info(adev->dev, "recover vram bo from shadow start\n");  	mutex_lock(&adev->shadow_list_lock);  	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { @@ -3837,11 +4033,11 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)  	dma_fence_put(fence);  	if (r < 0 || tmo <= 0) { -		DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); +		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);  		return -EIO;  	} -	DRM_INFO("recover vram bo from shadow done\n"); +	dev_info(adev->dev, "recover vram bo from shadow done\n");  	return 0;  } @@ -3876,7 +4072,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,  	amdgpu_virt_init_data_exchange(adev);  	/* we need recover gart prior to run SMC/CP/SDMA resume */ -	amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); +	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));  	r = amdgpu_device_fw_loading(adev);  	if (r) @@ -3902,6 +4098,34 @@ error:  }  /** + * amdgpu_device_has_job_running - check if there is any job in mirror list + * + * @adev: amdgpu device pointer + * + * check if there is any job in mirror list + */ +bool amdgpu_device_has_job_running(struct amdgpu_device *adev) +{ +	int i; +	struct drm_sched_job *job; + +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { +		struct amdgpu_ring *ring = adev->rings[i]; + +		if (!ring || !ring->sched.thread) +			continue; + +		spin_lock(&ring->sched.job_list_lock); +		job = list_first_entry_or_null(&ring->sched.ring_mirror_list, +				struct drm_sched_job, node); +		spin_unlock(&ring->sched.job_list_lock); +		if (job) +			return true; +	} +	return false; +} + +/**   * amdgpu_device_should_recover_gpu - check if we should try GPU recovery   *   * @adev: amdgpu device pointer @@ -3912,7 +4136,7 
@@ error:  bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)  {  	if (!amdgpu_device_ip_check_soft_reset(adev)) { -		DRM_INFO("Timeout, but no hardware hang detected.\n"); +		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");  		return false;  	} @@ -3952,7 +4176,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)  	return true;  disabled: -		DRM_INFO("GPU recovery disabled.\n"); +		dev_info(adev->dev, "GPU recovery disabled.\n");  		return false;  } @@ -3966,6 +4190,11 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,  	amdgpu_debugfs_wait_dump(adev); +	if (amdgpu_sriov_vf(adev)) { +		/* stop the data exchange thread */ +		amdgpu_virt_fini_data_exchange(adev); +	} +  	/* block all schedulers and reset given job's ring */  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {  		struct amdgpu_ring *ring = adev->rings[i]; @@ -3991,7 +4220,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,  			r = amdgpu_device_ip_soft_reset(adev);  			amdgpu_device_ip_post_soft_reset(adev);  			if (r || amdgpu_device_ip_check_soft_reset(adev)) { -				DRM_INFO("soft reset failed, will fallback to full reset!\n"); +				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");  				need_full_reset = true;  			}  		} @@ -4007,7 +4236,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,  static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  			       struct list_head *device_list_handle, -			       bool *need_full_reset_arg) +			       bool *need_full_reset_arg, +			       bool skip_hw_reset)  {  	struct amdgpu_device *tmp_adev = NULL;  	bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4017,7 +4247,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  	 * ASIC reset has to be done on all HGMI hive nodes ASAP  	 * to allow proper links negotiation in FW (within 1 sec)  	 */ -	if (need_full_reset) { +	if (!skip_hw_reset && need_full_reset) {  		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {  			/* For XGMI run all resets in parallel to speed up the process */  			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -4027,8 +4257,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  				r = amdgpu_asic_reset(tmp_adev);  			if (r) { -				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", -					 r, tmp_adev->ddev->unique); +				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", +					 r, adev_to_drm(tmp_adev)->unique);  				break;  			}  		} @@ -4060,8 +4290,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {  		if (need_full_reset) {  			/* post card */ -			if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) -				DRM_WARN("asic atom init failed!"); +			if (amdgpu_device_asic_init(tmp_adev)) +				dev_warn(tmp_adev->dev, "asic atom init failed!");  			if (!r) {  				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); @@ -4075,8 +4305,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  					amdgpu_inc_vram_lost(tmp_adev);  				} -				r = amdgpu_gtt_mgr_recover( -					&tmp_adev->mman.bdev.man[TTM_PL_TT]); +				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));  				if (r)  					goto out; @@ -4103,8 +4332,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  				amdgpu_fbdev_set_suspend(tmp_adev, 0); -				/* must succeed. 
*/ -				amdgpu_ras_resume(tmp_adev); +				/* +				 * The GPU enters bad state once faulty pages +				 * by ECC has reached the threshold, and ras +				 * recovery is scheduled next. So add one check +				 * here to break recovery if it indeed exceeds +				 * bad page threshold, and remind user to +				 * retire this GPU or setting one bigger +				 * bad_page_threshold value to fix this once +				 * probing driver again. +				 */ +				if (!amdgpu_ras_check_err_threshold(tmp_adev)) { +					/* must succeed. */ +					amdgpu_ras_resume(tmp_adev); +				} else { +					r = -EINVAL; +					goto out; +				}  				/* Update PSP FW topology after reset */  				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) @@ -4112,7 +4356,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,  			}  		} -  out:  		if (!r) {  			amdgpu_irq_gpu_reset_resume_helper(tmp_adev); @@ -4137,16 +4380,19 @@ end:  	return r;  } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, +				struct amdgpu_hive_info *hive)  { -	if (trylock) { -		if (!mutex_trylock(&adev->lock_reset)) -			return false; -	} else -		mutex_lock(&adev->lock_reset); +	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) +		return false; + +	if (hive) { +		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); +	} else { +		down_write(&adev->reset_sem); +	}  	atomic_inc(&adev->gpu_reset_counter); -	adev->in_gpu_reset = true;  	switch (amdgpu_asic_reset_method(adev)) {  	case AMD_RESET_METHOD_MODE1:  		adev->mp1_state = PP_MP1_STATE_SHUTDOWN; @@ -4166,8 +4412,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)  {  	amdgpu_vf_error_trans_all(adev);  	adev->mp1_state = PP_MP1_STATE_NONE; -	adev->in_gpu_reset = false; -	mutex_unlock(&adev->lock_reset); +	atomic_set(&adev->in_gpu_reset, 0); +	up_write(&adev->reset_sem);  }  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -4277,12 +4523,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,  	 * We always reset all schedulers for device and all devices for XGMI  	 * hive so that should take care of them too.  	 */ -	hive = amdgpu_get_xgmi_hive(adev, true); -	if (hive && !mutex_trylock(&hive->reset_lock)) { -		DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", -			  job ? job->base.id : -1, hive->hive_id); -		mutex_unlock(&hive->hive_lock); -		return 0; +	hive = amdgpu_get_xgmi_hive(adev); +	if (hive) { +		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { +			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", +				job ? job->base.id : -1, hive->hive_id); +			amdgpu_put_xgmi_hive(hive); +			return 0; +		} +		mutex_lock(&hive->hive_lock);  	}  	/* @@ -4304,11 +4553,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,  	/* block all schedulers and reset given job's ring */  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { -		if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { -			DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", +		if (!amdgpu_device_lock_adev(tmp_adev, hive)) { +			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",  				  job ? job->base.id : -1); -			mutex_unlock(&hive->hive_lock); -			return 0; +			r = 0; +			goto skip_recovery;  		}  		/* @@ -4376,12 +4625,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,  retry:	/* Rest of adevs pre asic reset from XGMI hive. 
*/  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {  		r = amdgpu_device_pre_asic_reset(tmp_adev, -						 NULL, +						 (tmp_adev == adev) ? job : NULL,  						 &need_full_reset);  		/*TODO Should we stop ?*/  		if (r) { -			DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", -				  r, tmp_adev->ddev->unique); +			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", +				  r, adev_to_drm(tmp_adev)->unique);  			tmp_adev->asic_reset_res = r;  		}  	} @@ -4393,7 +4642,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */  		if (r)  			adev->asic_reset_res = r;  	} else { -		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); +		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);  		if (r && r == -EAGAIN)  			goto retry;  	} @@ -4417,7 +4666,7 @@ skip_hw_reset:  		}  		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { -			drm_helper_resume_force_mode(tmp_adev->ddev); +			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));  		}  		tmp_adev->asic_reset_res = 0; @@ -4441,9 +4690,11 @@ skip_sched_resume:  		amdgpu_device_unlock_adev(tmp_adev);  	} +skip_recovery:  	if (hive) { -		mutex_unlock(&hive->reset_lock); +		atomic_set(&hive->in_reset, 0);  		mutex_unlock(&hive->hive_lock); +		amdgpu_put_xgmi_hive(hive);  	}  	if (r) @@ -4589,10 +4840,10 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)  int amdgpu_device_baco_enter(struct drm_device *dev)  { -	struct amdgpu_device *adev = dev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(dev);  	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); -	if (!amdgpu_device_supports_baco(adev->ddev)) +	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))  		return -ENOTSUPP;  	if (ras && ras->supported) @@ -4603,11 +4854,11 @@ int amdgpu_device_baco_enter(struct drm_device *dev)  int amdgpu_device_baco_exit(struct drm_device *dev)  { -	struct amdgpu_device *adev = dev->dev_private; +	struct amdgpu_device *adev = drm_to_adev(dev);  	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);  	int ret = 0; -	if (!amdgpu_device_supports_baco(adev->ddev)) +	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))  		return -ENOTSUPP;  	ret = amdgpu_dpm_baco_exit(adev); @@ -4619,3 +4870,235 @@ int amdgpu_device_baco_exit(struct drm_device *dev)  	return 0;  } + +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) +{ +	int i; + +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { +		struct amdgpu_ring *ring = adev->rings[i]; + +		if (!ring || !ring->sched.thread) +			continue; + +		cancel_delayed_work_sync(&ring->sched.work_tdr); +	} +} + +/** + * amdgpu_pci_error_detected - Called when a PCI error is detected. + * @pdev: PCI device struct + * @state: PCI channel state + * + * Description: Called when a PCI error is detected. + * + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. 
+ */ +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ +	struct drm_device *dev = pci_get_drvdata(pdev); +	struct amdgpu_device *adev = drm_to_adev(dev); +	int i; + +	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + +	if (adev->gmc.xgmi.num_physical_nodes > 1) { +		DRM_WARN("No support for XGMI hive yet..."); +		return PCI_ERS_RESULT_DISCONNECT; +	} + +	switch (state) { +	case pci_channel_io_normal: +		return PCI_ERS_RESULT_CAN_RECOVER; +	/* Fatal error, prepare for slot reset */ +	case pci_channel_io_frozen:		 +		/*		 +		 * Cancel and wait for all TDRs in progress if failing to +		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev +		 * +		 * Locking adev->reset_sem will prevent any external access +		 * to GPU during PCI error recovery +		 */ +		while (!amdgpu_device_lock_adev(adev, NULL)) +			amdgpu_cancel_all_tdr(adev); + +		/* +		 * Block any work scheduling as we do for regular GPU reset +		 * for the duration of the recovery +		 */ +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { +			struct amdgpu_ring *ring = adev->rings[i]; + +			if (!ring || !ring->sched.thread) +				continue; + +			drm_sched_stop(&ring->sched, NULL); +		} +		return PCI_ERS_RESULT_NEED_RESET; +	case pci_channel_io_perm_failure: +		/* Permanent error, prepare for device removal */ +		return PCI_ERS_RESULT_DISCONNECT; +	} + +	return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers + * @pdev: pointer to PCI device + */ +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) +{ + +	DRM_INFO("PCI error: mmio enabled callback!!\n"); + +	/* TODO - dump whatever for debugging purposes */ + +	/* This called only if amdgpu_pci_error_detected returns +	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still +	 * works, no need to reset slot. +	 */ + +	return PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. + * @pdev: PCI device struct + * + * Description: This routine is called by the pci error recovery + * code after the PCI slot has been reset, just before we + * should resume normal operations. + */ +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) +{ +	struct drm_device *dev = pci_get_drvdata(pdev); +	struct amdgpu_device *adev = drm_to_adev(dev); +	int r, i; +	bool need_full_reset = true; +	u32 memsize; +	struct list_head device_list; + +	DRM_INFO("PCI error: slot reset callback!!\n"); + +	INIT_LIST_HEAD(&device_list); +	list_add_tail(&adev->gmc.xgmi.head, &device_list); + +	/* wait for asic to come out of reset */ +	msleep(500); + +	/* Restore PCI confspace */ +	amdgpu_device_load_pci_state(pdev); + +	/* confirm  ASIC came out of reset */ +	for (i = 0; i < adev->usec_timeout; i++) { +		memsize = amdgpu_asic_get_config_memsize(adev); + +		if (memsize != 0xffffffff) +			break; +		udelay(1); +	} +	if (memsize == 0xffffffff) { +		r = -ETIME; +		goto out; +	} + +	adev->in_pci_err_recovery = true;	 +	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); +	adev->in_pci_err_recovery = false; +	if (r) +		goto out; + +	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + +out: +	if (!r) { +		if (amdgpu_device_cache_pci_state(adev->pdev)) +			pci_restore_state(adev->pdev); + +		DRM_INFO("PCIe error recovery succeeded\n"); +	} else { +		DRM_ERROR("PCIe error recovery failed, err:%d", r); +		amdgpu_device_unlock_adev(adev); +	} + +	return r ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_resume() - resume normal ops after PCI reset + * @pdev: pointer to PCI device + * + * Called when the error recovery driver tells us that its + * OK to resume normal operation. Use completion to allow + * halted scsi ops to resume. + */ +void amdgpu_pci_resume(struct pci_dev *pdev) +{ +	struct drm_device *dev = pci_get_drvdata(pdev); +	struct amdgpu_device *adev = drm_to_adev(dev); +	int i; + + +	DRM_INFO("PCI error: resume callback!!\n"); + +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { +		struct amdgpu_ring *ring = adev->rings[i]; + +		if (!ring || !ring->sched.thread) +			continue; + + +		drm_sched_resubmit_jobs(&ring->sched); +		drm_sched_start(&ring->sched, true); +	} + +	amdgpu_device_unlock_adev(adev); +} + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ +	struct drm_device *dev = pci_get_drvdata(pdev); +	struct amdgpu_device *adev = drm_to_adev(dev); +	int r; + +	r = pci_save_state(pdev); +	if (!r) { +		kfree(adev->pci_state); + +		adev->pci_state = pci_store_saved_state(pdev); + +		if (!adev->pci_state) { +			DRM_ERROR("Failed to store PCI saved state"); +			return false; +		} +	} else { +		DRM_WARN("Failed to save PCI state, err:%d\n", r); +		return false; +	} + +	return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ +	struct drm_device *dev = pci_get_drvdata(pdev); +	struct amdgpu_device *adev = drm_to_adev(dev); +	int r; + +	if (!adev->pci_state) +		return false; + +	r = pci_load_saved_state(pdev, adev->pci_state); + +	if (!r) { +		pci_restore_state(pdev); +	} else { +		DRM_WARN("Failed to load PCI state, err:%d\n", r); +		return false; +	} + +	return true; +} + + |
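
For reference, the dev_private to drm_to_adev()/adev_to_drm() conversions throughout this diff rely on struct drm_device now being embedded directly inside struct amdgpu_device (note the new adev->ddev.pdev use above). A minimal sketch of how such helpers are typically defined, assuming the embedded member is named ddev; the exact definitions live in amdgpu.h and may differ slightly:

static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
{
	/* recover the containing amdgpu_device from its embedded drm_device */
	return container_of(ddev, struct amdgpu_device, ddev);
}

static inline struct drm_device *adev_to_drm(struct amdgpu_device *adev)
{
	/* the drm_device is a plain member, so no dev_private indirection */
	return &adev->ddev;
}

With the embedding in place, dev_get_drvdata(dev) in the sysfs show functions still returns the drm_device, and drm_to_adev() replaces the old ddev->dev_private lookup.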
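
The new amdgpu_device_indirect_rreg()/wreg() helpers implement the classic index/data window: write the target offset to the index register, read it back to post the write, then access the data register, all under pcie_idx_lock. Offsets outside the BAR in amdgpu_device_rreg()/wreg() are routed through adev->pcie_rreg/pcie_wreg, which an ASIC file can point at these helpers. A hedged sketch of such a hookup, assuming NBIO callbacks named get_pcie_index_offset/get_pcie_data_offset and hypothetical example_* wrapper names:

static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
{
	u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	/* reg is the indirect address handed in by amdgpu_device_rreg() */
	return amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data, reg);
}

static void example_pcie_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	amdgpu_device_indirect_wreg(adev, pcie_index, pcie_data, reg, v);
}

/* wired up during early init, e.g.: adev->pcie_rreg = example_pcie_rreg; */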
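
The amdgpu_pci_error_detected/mmio_enabled/slot_reset/resume callbacks added at the end only take effect once they are hooked into the PCI core; that wiring belongs in amdgpu_drv.c rather than in this file. A sketch of the expected hookup, with the driver-struct and probe/remove names (amdgpu_kms_pci_driver, pciidlist, amdgpu_pci_probe, ...) taken as assumptions:

static struct pci_error_handlers amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

static struct pci_driver amdgpu_kms_pci_driver = {
	.name		= DRIVER_NAME,
	.id_table	= pciidlist,
	.probe		= amdgpu_pci_probe,
	.remove		= amdgpu_pci_remove,
	.shutdown	= amdgpu_pci_shutdown,
	.driver.pm	= &amdgpu_pm_ops,
	/* AER recovery entry points implemented in amdgpu_device.c */
	.err_handler	= &amdgpu_pci_err_handler,
};

On a frozen channel the core calls error_detected(), which locks the adev and stops the schedulers; slot_reset() then restores the cached config space and resets the ASIC, and on success resume() resubmits jobs and restarts the schedulers.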