Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
 -rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 860
 1 file changed, 674 insertions, 186 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index eb7cfe87042e..7560b05e4ac1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -80,8 +80,8 @@ MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); -MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 @@ -116,6 +116,8 @@ const char *amdgpu_asic_name[] = { "NAVI12", "SIENNA_CICHLID", "NAVY_FLOUNDER", + "VANGOGH", + "DIMGREY_CAVEFISH", "LAST", }; @@ -132,7 +134,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, struct device_attribute *attr, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev); return snprintf(buf, PAGE_SIZE, "%llu\n", cnt); @@ -157,7 +159,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev, struct device_attribute *attr, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); } @@ -179,7 +181,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev, struct device_attribute *attr, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); } @@ -201,7 +203,7 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev, struct device_attribute *attr, char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = ddev->dev_private; + struct amdgpu_device *adev = drm_to_adev(ddev); return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); } @@ -219,7 +221,7 @@ static DEVICE_ATTR(serial_number, S_IRUGO, */ bool amdgpu_device_supports_boco(struct drm_device *dev) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); if (adev->flags & AMD_IS_PX) return true; @@ -236,14 +238,16 @@ bool amdgpu_device_supports_boco(struct drm_device *dev) */ bool amdgpu_device_supports_baco(struct drm_device *dev) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); return amdgpu_asic_supports_baco(adev); } +/* + * VRAM access helper functions + */ + /** - * VRAM access helper functions. - * * amdgpu_device_vram_access - read/write a buffer in vram * * @adev: amdgpu_device pointer @@ -303,10 +307,10 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, } /* - * MMIO register access helper functions. + * register access helper functions. */ /** - * amdgpu_mm_rreg - read a memory mapped IO register + * amdgpu_device_rreg - read a memory mapped IO or indirect register * * @adev: amdgpu_device pointer * @reg: dword aligned register offset @@ -314,25 +318,29 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, * * Returns the 32 bit value from the offset specified. 
*/ -uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, - uint32_t acc_flags) +uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, + uint32_t reg, uint32_t acc_flags) { uint32_t ret; - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_rreg(adev, reg); - - if ((reg * 4) < adev->rmmio_size) - ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); - else { - unsigned long flags; + if (adev->in_pci_err_recovery) + return 0; - spin_lock_irqsave(&adev->mmio_idx_lock, flags); - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); - ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); + if ((reg * 4) < adev->rmmio_size) { + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(&adev->reset_sem)) { + ret = amdgpu_kiq_rreg(adev, reg); + up_read(&adev->reset_sem); + } else { + ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); + } + } else { + ret = adev->pcie_rreg(adev, reg * 4); } - trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret); + + trace_amdgpu_device_rreg(adev->pdev->device, reg, ret); + return ret; } @@ -350,7 +358,11 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, * * Returns the 8 bit value from the offset specified. */ -uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { +uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) +{ + if (adev->in_pci_err_recovery) + return 0; + if (offset < adev->rmmio_size) return (readb(adev->rmmio + offset)); BUG(); @@ -371,31 +383,19 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) { * * Writes the value specified to the offset specified. */ -void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) { +void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) +{ + if (adev->in_pci_err_recovery) + return; + if (offset < adev->rmmio_size) writeb(value, adev->rmmio + offset); else BUG(); } -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) -{ - trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); - - if ((reg * 4) < adev->rmmio_size) - writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); - else { - unsigned long flags; - - spin_lock_irqsave(&adev->mmio_idx_lock, flags); - writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4)); - writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); - } -} - /** - * amdgpu_mm_wreg - write to a memory mapped IO register + * amdgpu_device_wreg - write to a memory mapped IO or indirect register * * @adev: amdgpu_device pointer * @reg: dword aligned register offset @@ -404,13 +404,27 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, * * Writes the value specified to the offset specified. 
*/ -void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, - uint32_t acc_flags) +void amdgpu_device_wreg(struct amdgpu_device *adev, + uint32_t reg, uint32_t v, + uint32_t acc_flags) { - if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) - return amdgpu_kiq_wreg(adev, reg, v); + if (adev->in_pci_err_recovery) + return; - amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); + if ((reg * 4) < adev->rmmio_size) { + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(&adev->reset_sem)) { + amdgpu_kiq_wreg(adev, reg, v); + up_read(&adev->reset_sem); + } else { + writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); + } + } else { + adev->pcie_wreg(adev, reg * 4, v); + } + + trace_amdgpu_device_wreg(adev->pdev->device, reg, v); } /* @@ -418,18 +432,20 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, * * this function is invoked only the debugfs register access * */ -void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, - uint32_t acc_flags) +void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, + uint32_t reg, uint32_t v) { - if (amdgpu_sriov_fullaccess(adev) && - adev->gfx.rlc.funcs && - adev->gfx.rlc.funcs->is_rlcg_access_range) { + if (adev->in_pci_err_recovery) + return; + if (amdgpu_sriov_fullaccess(adev) && + adev->gfx.rlc.funcs && + adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v); + } else { + writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); } - - amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); } /** @@ -442,6 +458,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t */ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) { + if (adev->in_pci_err_recovery) + return 0; + if ((reg * 4) < adev->rio_mem_size) return ioread32(adev->rio_mem + (reg * 4)); else { @@ -461,6 +480,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { + if (adev->in_pci_err_recovery) + return; + if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { @@ -480,6 +502,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) */ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return readl(adev->doorbell.ptr + index); } else { @@ -500,6 +525,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { writel(v, adev->doorbell.ptr + index); } else { @@ -518,6 +546,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) */ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) { + if (adev->in_pci_err_recovery) + return 0; + if (index < adev->doorbell.num_doorbells) { return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); } else { @@ -538,6 +569,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) */ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) { + if (adev->in_pci_err_recovery) + return; + if (index < adev->doorbell.num_doorbells) { atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); } else { @@ -546,9 +580,138 @@ void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, 
u32 index, u64 v) } /** + * amdgpu_device_indirect_rreg - read an indirect register + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * + * Returns the value of indirect register @reg_addr + */ +u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev, + u32 pcie_index, u32 pcie_data, + u32 reg_addr) +{ + unsigned long flags; + u32 r; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + r = readl(pcie_data_offset); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + + return r; +} + +/** + * amdgpu_device_indirect_rreg64 - read a 64bits indirect register + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * + * Returns the value of indirect register @reg_addr + */ +u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, + u32 pcie_index, u32 pcie_data, + u32 reg_addr) +{ + unsigned long flags; + u64 r; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + + /* read low 32 bits */ + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + r = readl(pcie_data_offset); + /* read high 32 bits */ + writel(reg_addr + 4, pcie_index_offset); + readl(pcie_index_offset); + r |= ((u64)readl(pcie_data_offset) << 32); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + + return r; +} + +/** + * amdgpu_device_indirect_wreg - write an indirect register address + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * @reg_addr: indirect register offset + * @reg_data: indirect register data + * + */ +void amdgpu_device_indirect_wreg(struct amdgpu_device *adev, + u32 pcie_index, u32 pcie_data, + u32 reg_addr, u32 reg_data) +{ + unsigned long flags; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + writel(reg_data, pcie_data_offset); + readl(pcie_data_offset); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + +/** + * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address + * + * @adev: amdgpu_device pointer + * @pcie_index: mmio register offset + * @pcie_data: mmio register offset + * @reg_addr: indirect register offset + * @reg_data: indirect register data + * + */ +void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, + u32 pcie_index, u32 pcie_data, + u32 reg_addr, u64 reg_data) +{ + unsigned long flags; + void __iomem *pcie_index_offset; + void __iomem *pcie_data_offset; + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + + /* write low 32 bits */ + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); + 
readl(pcie_data_offset); + /* write high 32 bits */ + writel(reg_addr + 4, pcie_index_offset); + readl(pcie_index_offset); + writel((u32)(reg_data >> 32), pcie_data_offset); + readl(pcie_data_offset); + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + +/** * amdgpu_invalid_rreg - dummy reg read function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @reg: offset of register * * Dummy register read function. Used for register blocks @@ -565,7 +728,7 @@ static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg) /** * amdgpu_invalid_wreg - dummy reg write function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @reg: offset of register * @v: value to write to the register * @@ -582,7 +745,7 @@ static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32 /** * amdgpu_invalid_rreg64 - dummy 64 bit reg read function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @reg: offset of register * * Dummy register read function. Used for register blocks @@ -599,7 +762,7 @@ static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) /** * amdgpu_invalid_wreg64 - dummy reg write function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @reg: offset of register * @v: value to write to the register * @@ -616,7 +779,7 @@ static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint /** * amdgpu_block_invalid_rreg - dummy reg read function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @block: offset of instance * @reg: offset of register * @@ -636,7 +799,7 @@ static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev, /** * amdgpu_block_invalid_wreg - dummy reg write function * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @block: offset of instance * @reg: offset of register * @v: value to write to the register @@ -654,9 +817,23 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, } /** + * amdgpu_device_asic_init - Wrapper for atom asic_init + * + * @adev: amdgpu_device pointer + * + * Does any asic specific work and then calls atom asic init. + */ +static int amdgpu_device_asic_init(struct amdgpu_device *adev) +{ + amdgpu_asic_pre_asic_init(adev); + + return amdgpu_atom_asic_init(adev->mode_info.atom_context); +} + +/** * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * * Allocates a scratch page of VRAM for use by various things in the * driver. @@ -673,7 +850,7 @@ static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev) /** * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * * Frees the VRAM scratch page. */ @@ -1199,6 +1376,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_gmc_tmz_set(adev); + amdgpu_gmc_noretry_set(adev); + return 0; } @@ -1211,7 +1390,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) * Callback for the switcheroo driver. Suspends or resumes the * the asics before or after it is powered up using ACPI methods. 
*/ -static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state) +static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, + enum vga_switcheroo_state state) { struct drm_device *dev = pci_get_drvdata(pdev); int r; @@ -1225,7 +1405,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; pci_set_power_state(dev->pdev, PCI_D0); - pci_restore_state(dev->pdev); + amdgpu_device_load_pci_state(dev->pdev); r = pci_enable_device(dev->pdev); if (r) DRM_WARN("pci_enable_device failed (%d)\n", r); @@ -1238,7 +1418,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); - pci_save_state(dev->pdev); + amdgpu_device_cache_pci_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3cold); @@ -1504,7 +1684,7 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) adev->enable_virtual_display = false; if (amdgpu_virtual_display) { - struct drm_device *ddev = adev->ddev; + struct drm_device *ddev = adev_to_drm(adev); const char *pci_address_name = pci_name(ddev->pdev); char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname; @@ -1563,7 +1743,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; - if (adev->discovery_bin) { + if (adev->mman.discovery_bin) { amdgpu_discovery_get_gfx_info(adev); /* @@ -1600,6 +1780,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_CARRIZO: case CHIP_STONEY: case CHIP_VEGA20: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: default: return 0; case CHIP_VEGA10: @@ -1620,7 +1803,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) chip_name = "arcturus"; break; case CHIP_RENOIR: - chip_name = "renoir"; + if (adev->apu_flags & AMD_APU_IS_RENOIR) + chip_name = "renoir"; + else + chip_name = "green_sardine"; break; case CHIP_NAVI10: chip_name = "navi10"; @@ -1631,11 +1817,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_NAVI12: chip_name = "navi12"; break; - case CHIP_SIENNA_CICHLID: - chip_name = "sienna_cichlid"; - break; - case CHIP_NAVY_FLOUNDER: - chip_name = "navy_flounder"; + case CHIP_VANGOGH: + chip_name = "vangogh"; break; } @@ -1811,7 +1994,12 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) case CHIP_NAVI12: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: - adev->family = AMDGPU_FAMILY_NV; + case CHIP_DIMGREY_CAVEFISH: + case CHIP_VANGOGH: + if (adev->asic_type == CHIP_VANGOGH) + adev->family = AMDGPU_FAMILY_VGH; + else + adev->family = AMDGPU_FAMILY_NV; r = nv_set_ip_blocks(adev); if (r) @@ -1935,7 +2123,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev) if (adev->ip_blocks[i].status.hw == true) break; - if (adev->in_gpu_reset || adev->in_suspend) { + if (amdgpu_in_reset(adev) || adev->in_suspend) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { DRM_ERROR("resume of IP block <%s> failed %d\n", @@ -2055,13 +2243,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. 
- * recovery_init may fail, but it can free all resources allocated by - * itself and its failure should not stop amdgpu init process. + * + * amdgpu_ras_recovery_init may fail, but the upper only cares the + * failure from bad gpu situation and stop amdgpu init process + * accordingly. For other failed cases, it will still release all + * the resource and print error message, rather than returning one + * negative value to upper level. * * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); @@ -2106,7 +2300,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev) AMDGPU_RESET_MAGIC_NUM)) return true; - if (!adev->in_gpu_reset) + if (!amdgpu_in_reset(adev)) return false; /* @@ -2217,9 +2411,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void) gpu_ins = &(mgpu_info.gpu_ins[i]); adev = gpu_ins->adev; if (!(adev->flags & AMD_IS_APU) && - !gpu_ins->mgpu_fan_enabled && - adev->powerplay.pp_funcs && - adev->powerplay.pp_funcs->enable_mgpu_fan_boost) { + !gpu_ins->mgpu_fan_enabled) { ret = amdgpu_dpm_enable_mgpu_fan_boost(adev); if (ret) break; @@ -2574,17 +2766,16 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, }; - for (i = 0; i < adev->num_ip_blocks; i++) - adev->ip_blocks[i].status.hw = false; - for (i = 0; i < ARRAY_SIZE(ip_order); i++) { int j; struct amdgpu_ip_block *block; - for (j = 0; j < adev->num_ip_blocks; j++) { - block = &adev->ip_blocks[j]; + block = &adev->ip_blocks[i]; + block->status.hw = false; - if (block->version->type != ip_order[i] || + for (j = 0; j < ARRAY_SIZE(ip_order); j++) { + + if (block->version->type != ip_order[j] || !block->status.valid) continue; @@ -2777,6 +2968,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) { switch (asic_type) { #if defined(CONFIG_DRM_AMD_DC) +#if defined(CONFIG_DRM_AMD_DC_SI) + case CHIP_TAHITI: + case CHIP_PITCAIRN: + case CHIP_VERDE: + case CHIP_OLAND: +#endif case CHIP_BONAIRE: case CHIP_KAVERI: case CHIP_KABINI: @@ -2807,10 +3004,10 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_NAVI14: case CHIP_NAVI12: case CHIP_RENOIR: -#endif -#if defined(CONFIG_DRM_AMD_DC_DCN3_0) case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: + case CHIP_DIMGREY_CAVEFISH: + case CHIP_VANGOGH: #endif return amdgpu_dc != 0; #endif @@ -2825,13 +3022,13 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) /** * amdgpu_device_has_dc_support - check if dc is supported * - * @adev: amdgpu_device_pointer + * @adev: amdgpu_device pointer * * Returns true for supported, false for not supported */ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) { - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display) return false; return amdgpu_device_asic_has_dc_support(adev->asic_type); @@ -2842,7 +3039,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -2857,13 +3054,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct 
*__work) if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) { task_barrier_enter(&hive->tb); - adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev); + adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev)); if (adev->asic_reset_res) goto fail; task_barrier_exit(&hive->tb); - adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev); + adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev)); if (adev->asic_reset_res) goto fail; @@ -2879,7 +3076,8 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) fail: if (adev->asic_reset_res) DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", - adev->asic_reset_res, adev->ddev->unique); + adev->asic_reset_res, adev_to_drm(adev)->unique); + amdgpu_put_xgmi_hive(hive); } static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) @@ -2958,12 +3156,11 @@ static const struct attribute *amdgpu_dev_attributes[] = { NULL }; + /** * amdgpu_device_init - initialize the driver * * @adev: amdgpu_device pointer - * @ddev: drm dev pointer - * @pdev: pci dev pointer * @flags: driver flags * * Initializes the driver info and hw (all asics). @@ -2971,18 +3168,15 @@ static const struct attribute *amdgpu_dev_attributes[] = { * Called at driver startup. */ int amdgpu_device_init(struct amdgpu_device *adev, - struct drm_device *ddev, - struct pci_dev *pdev, uint32_t flags) { + struct drm_device *ddev = adev_to_drm(adev); + struct pci_dev *pdev = adev->pdev; int r, i; bool boco = false; u32 max_MBps; adev->shutdown = false; - adev->dev = &pdev->dev; - adev->ddev = ddev; - adev->pdev = pdev; adev->flags = flags; if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) @@ -3038,7 +3232,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->mn_lock); mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); - mutex_init(&adev->lock_reset); + atomic_set(&adev->in_gpu_reset, 0); + init_rwsem(&adev->reset_sem); mutex_init(&adev->psp.mutex); mutex_init(&adev->notifier_lock); @@ -3133,13 +3328,13 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_device_get_job_timeout_settings(adev); if (r) { dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return r; + goto failed_unmap; } /* early init functions */ r = amdgpu_device_ip_early_init(adev); if (r) - return r; + goto failed_unmap; /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3180,6 +3375,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } + pci_enable_pcie_error_reporting(adev->ddev.pdev); + /* Post card if necessary */ if (amdgpu_device_need_post(adev)) { if (!adev->bios) { @@ -3188,7 +3385,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, goto failed; } DRM_INFO("GPU posting now...\n"); - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); + r = amdgpu_device_asic_init(adev); if (r) { dev_err(adev->dev, "gpu post error!\n"); goto failed; @@ -3226,7 +3423,7 @@ fence_driver_init: } /* init the mode config */ - drm_mode_config_init(adev->ddev); + drm_mode_config_init(adev_to_drm(adev)); r = amdgpu_device_ip_init(adev); if (r) { @@ -3322,16 +3519,18 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); - if (r) { + if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); - return r; - } if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) dev_err(adev->dev, "amdgpu_pmu_init failed\n"); + /* Have stored pci 
confspace at hand for restore in sudden PCI error */ + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(pdev); + return 0; failed: @@ -3339,6 +3538,10 @@ failed: if (boco) vga_switcheroo_fini_domain_pm_ops(adev->dev); +failed_unmap: + iounmap(adev->rmmio); + adev->rmmio = NULL; + return r; } @@ -3352,31 +3555,33 @@ failed: */ void amdgpu_device_fini(struct amdgpu_device *adev) { - int r; - - DRM_INFO("amdgpu: finishing device.\n"); + dev_info(adev->dev, "amdgpu: finishing device.\n"); flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + kfree(adev->pci_state); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev)) { amdgpu_virt_request_full_gpu(adev, false); + amdgpu_virt_fini_data_exchange(adev); + } /* disable all interrupts */ amdgpu_irq_disable_all(adev); if (adev->mode_info.mode_config_initialized){ if (!amdgpu_device_has_dc_support(adev)) - drm_helper_force_disable_all(adev->ddev); + drm_helper_force_disable_all(adev_to_drm(adev)); else - drm_atomic_helper_shutdown(adev->ddev); + drm_atomic_helper_shutdown(adev_to_drm(adev)); } amdgpu_fence_driver_fini(adev); if (adev->pm_sysfs_en) amdgpu_pm_sysfs_fini(adev); amdgpu_fbdev_fini(adev); - r = amdgpu_device_ip_fini(adev); + amdgpu_device_ip_fini(adev); release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; @@ -3394,7 +3599,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_has_atpx_dgpu_power_cntl()) && !pci_is_thunderbolt_attached(adev->pdev)) vga_switcheroo_unregister_client(adev->pdev); - if (amdgpu_device_supports_boco(adev->ddev)) + if (amdgpu_device_supports_boco(adev_to_drm(adev))) vga_switcheroo_fini_domain_pm_ops(adev->dev); vga_client_register(adev->pdev, NULL, NULL, NULL); if (adev->rio_mem) @@ -3410,7 +3615,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev) sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); - if (adev->discovery_bin) + if (adev->mman.discovery_bin) amdgpu_discovery_fini(adev); } @@ -3436,11 +3641,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) struct drm_connector_list_iter iter; int r; - if (dev == NULL || dev->dev_private == NULL) { - return -ENODEV; - } - - adev = dev->dev_private; + adev = drm_to_adev(dev); if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -3528,7 +3729,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) { struct drm_connector *connector; struct drm_connector_list_iter iter; - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct drm_crtc *crtc; int r = 0; @@ -3537,14 +3738,14 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) /* post card */ if (amdgpu_device_need_post(adev)) { - r = amdgpu_atom_asic_init(adev->mode_info.atom_context); + r = amdgpu_device_asic_init(adev); if (r) - DRM_ERROR("amdgpu asic init failed\n"); + dev_err(adev->dev, "amdgpu asic init failed\n"); } r = amdgpu_device_ip_resume(adev); if (r) { - DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r); + dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); return r; } amdgpu_fence_driver_resume(adev); @@ -3568,7 +3769,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) if (r == 0) { r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM); if (r != 0) - DRM_ERROR("Failed to pin cursor BO (%d)\n", r); + 
dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r); amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); amdgpu_bo_unreserve(aobj); } @@ -3658,7 +3859,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev) adev->ip_blocks[i].status.hang = adev->ip_blocks[i].version->funcs->check_soft_reset(adev); if (adev->ip_blocks[i].status.hang) { - DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); + dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name); asic_hang = true; } } @@ -3719,7 +3920,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev) (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { if (adev->ip_blocks[i].status.hang) { - DRM_INFO("Some block need full reset!\n"); + dev_info(adev->dev, "Some block need full reset!\n"); return true; } } @@ -3807,7 +4008,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) else tmo = msecs_to_jiffies(100); - DRM_INFO("recover vram bo from shadow start\n"); + dev_info(adev->dev, "recover vram bo from shadow start\n"); mutex_lock(&adev->shadow_list_lock); list_for_each_entry(shadow, &adev->shadow_list, shadow_list) { @@ -3843,11 +4044,11 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) dma_fence_put(fence); if (r < 0 || tmo <= 0) { - DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); + dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo); return -EIO; } - DRM_INFO("recover vram bo from shadow done\n"); + dev_info(adev->dev, "recover vram bo from shadow done\n"); return 0; } @@ -3855,7 +4056,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) /** * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @from_hypervisor: request from hypervisor * * do VF FLR and reinitialize Asic @@ -3882,7 +4083,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, amdgpu_virt_init_data_exchange(adev); /* we need recover gart prior to run SMC/CP/SDMA resume */ - amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]); + amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT)); r = amdgpu_device_fw_loading(adev); if (r) @@ -3908,9 +4109,37 @@ error: } /** + * amdgpu_device_has_job_running - check if there is any job in mirror list + * + * @adev: amdgpu_device pointer + * + * check if there is any job in mirror list + */ +bool amdgpu_device_has_job_running(struct amdgpu_device *adev) +{ + int i; + struct drm_sched_job *job; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + spin_lock(&ring->sched.job_list_lock); + job = list_first_entry_or_null(&ring->sched.ring_mirror_list, + struct drm_sched_job, node); + spin_unlock(&ring->sched.job_list_lock); + if (job) + return true; + } + return false; +} + +/** * amdgpu_device_should_recover_gpu - check if we should try GPU recovery * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover * a hung GPU. 
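The hunk above introduces amdgpu_device_has_job_running(), which walks every ring and reports whether a drm_sched_job is still sitting on the scheduler's mirror list. Purely as an illustrative sketch (the helper name and retry bound below are invented, not part of this patch), a caller that wants to quiesce the GPU before tearing it down could poll it like this:

#include <linux/delay.h>
#include "amdgpu.h"

/* Hypothetical caller, for illustration only - not part of the patch. */
static void example_wait_for_idle_jobs(struct amdgpu_device *adev)
{
	int retries = 100;	/* arbitrary bound for the sketch */

	/* amdgpu_device_has_job_running() is the helper added above */
	while (amdgpu_device_has_job_running(adev) && retries--)
		msleep(10);	/* let the schedulers drain queued work */

	if (amdgpu_device_has_job_running(adev))
		dev_warn(adev->dev, "GPU jobs still pending after drain\n");
}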
@@ -3918,7 +4147,7 @@ error: bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) { if (!amdgpu_device_ip_check_soft_reset(adev)) { - DRM_INFO("Timeout, but no hardware hang detected.\n"); + dev_info(adev->dev, "Timeout, but no hardware hang detected.\n"); return false; } @@ -3958,7 +4187,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) return true; disabled: - DRM_INFO("GPU recovery disabled.\n"); + dev_info(adev->dev, "GPU recovery disabled.\n"); return false; } @@ -3972,6 +4201,11 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, amdgpu_debugfs_wait_dump(adev); + if (amdgpu_sriov_vf(adev)) { + /* stop the data exchange thread */ + amdgpu_virt_fini_data_exchange(adev); + } + /* block all schedulers and reset given job's ring */ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -3997,7 +4231,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, r = amdgpu_device_ip_soft_reset(adev); amdgpu_device_ip_post_soft_reset(adev); if (r || amdgpu_device_ip_check_soft_reset(adev)) { - DRM_INFO("soft reset failed, will fallback to full reset!\n"); + dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n"); need_full_reset = true; } } @@ -4013,7 +4247,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, struct list_head *device_list_handle, - bool *need_full_reset_arg) + bool *need_full_reset_arg, + bool skip_hw_reset) { struct amdgpu_device *tmp_adev = NULL; bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@ -4023,7 +4258,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, * ASIC reset has to be done on all HGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) */ - if (need_full_reset) { + if (!skip_hw_reset && need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /* For XGMI run all resets in parallel to speed up the process */ if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { @@ -4033,8 +4268,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, r = amdgpu_asic_reset(tmp_adev); if (r) { - DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s", - r, tmp_adev->ddev->unique); + dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", + r, adev_to_drm(tmp_adev)->unique); break; } } @@ -4066,8 +4301,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { if (need_full_reset) { /* post card */ - if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context)) - DRM_WARN("asic atom init failed!"); + if (amdgpu_device_asic_init(tmp_adev)) + dev_warn(tmp_adev->dev, "asic atom init failed!"); if (!r) { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); @@ -4081,8 +4316,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_inc_vram_lost(tmp_adev); } - r = amdgpu_gtt_mgr_recover( - &tmp_adev->mman.bdev.man[TTM_PL_TT]); + r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT)); if (r) goto out; @@ -4109,8 +4343,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_fbdev_set_suspend(tmp_adev, 0); - /* must succeed. */ - amdgpu_ras_resume(tmp_adev); + /* + * The GPU enters bad state once faulty pages + * by ECC has reached the threshold, and ras + * recovery is scheduled next. 
So add one check + * here to break recovery if it indeed exceeds + * bad page threshold, and remind user to + * retire this GPU or setting one bigger + * bad_page_threshold value to fix this once + * probing driver again. + */ + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + /* must succeed. */ + amdgpu_ras_resume(tmp_adev); + } else { + r = -EINVAL; + goto out; + } /* Update PSP FW topology after reset */ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) @@ -4118,7 +4367,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - out: if (!r) { amdgpu_irq_gpu_reset_resume_helper(tmp_adev); @@ -4143,16 +4391,19 @@ end: return r; } -static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) +static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, + struct amdgpu_hive_info *hive) { - if (trylock) { - if (!mutex_trylock(&adev->lock_reset)) - return false; - } else - mutex_lock(&adev->lock_reset); + if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0) + return false; + + if (hive) { + down_write_nest_lock(&adev->reset_sem, &hive->hive_lock); + } else { + down_write(&adev->reset_sem); + } atomic_inc(&adev->gpu_reset_counter); - adev->in_gpu_reset = true; switch (amdgpu_asic_reset_method(adev)) { case AMD_RESET_METHOD_MODE1: adev->mp1_state = PP_MP1_STATE_SHUTDOWN; @@ -4172,8 +4423,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; - adev->in_gpu_reset = false; - mutex_unlock(&adev->lock_reset); + atomic_set(&adev->in_gpu_reset, 0); + up_write(&adev->reset_sem); } static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) @@ -4237,7 +4488,7 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * - * @adev: amdgpu device pointer + * @adev: amdgpu_device pointer * @job: which job trigger hang * * Attempt to reset the GPU if it has hung (all asics). @@ -4257,7 +4508,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; - /** + /* * Special case: RAS triggered and full reset isn't supported */ need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); @@ -4283,12 +4534,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, true); - if (hive && !mutex_trylock(&hive->reset_lock)) { - DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", - job ? job->base.id : -1, hive->hive_id); - mutex_unlock(&hive->hive_lock); - return 0; + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { + DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", + job ? 
job->base.id : -1, hive->hive_id); + amdgpu_put_xgmi_hive(hive); + return 0; + } + mutex_lock(&hive->hive_lock); } /* @@ -4310,11 +4564,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { - DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", + if (!amdgpu_device_lock_adev(tmp_adev, hive)) { + dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress", job ? job->base.id : -1); - mutex_unlock(&hive->hive_lock); - return 0; + r = 0; + goto skip_recovery; } /* @@ -4382,12 +4636,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { r = amdgpu_device_pre_asic_reset(tmp_adev, - NULL, + (tmp_adev == adev) ? job : NULL, &need_full_reset); /*TODO Should we stop ?*/ if (r) { - DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", - r, tmp_adev->ddev->unique); + dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", + r, adev_to_drm(tmp_adev)->unique); tmp_adev->asic_reset_res = r; } } @@ -4399,7 +4653,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (r) adev->asic_reset_res = r; } else { - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset); + r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); if (r && r == -EAGAIN) goto retry; } @@ -4423,7 +4677,7 @@ skip_hw_reset: } if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { - drm_helper_resume_force_mode(tmp_adev->ddev); + drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); } tmp_adev->asic_reset_res = 0; @@ -4447,9 +4701,11 @@ skip_sched_resume: amdgpu_device_unlock_adev(tmp_adev); } +skip_recovery: if (hive) { - mutex_unlock(&hive->reset_lock); + atomic_set(&hive->in_reset, 0); mutex_unlock(&hive->hive_lock); + amdgpu_put_xgmi_hive(hive); } if (r) @@ -4595,10 +4851,10 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev) int amdgpu_device_baco_enter(struct drm_device *dev) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - if (!amdgpu_device_supports_baco(adev->ddev)) + if (!amdgpu_device_supports_baco(adev_to_drm(adev))) return -ENOTSUPP; if (ras && ras->supported) @@ -4609,11 +4865,11 @@ int amdgpu_device_baco_enter(struct drm_device *dev) int amdgpu_device_baco_exit(struct drm_device *dev) { - struct amdgpu_device *adev = dev->dev_private; + struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int ret = 0; - if (!amdgpu_device_supports_baco(adev->ddev)) + if (!amdgpu_device_supports_baco(adev_to_drm(adev))) return -ENOTSUPP; ret = amdgpu_dpm_baco_exit(adev); @@ -4625,3 +4881,235 @@ int amdgpu_device_baco_exit(struct drm_device *dev) return 0; } + +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev) +{ + int i; + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + cancel_delayed_work_sync(&ring->sched.work_tdr); + } +} + +/** + * amdgpu_pci_error_detected - Called when a PCI error is detected. + * @pdev: PCI device struct + * @state: PCI channel state + * + * Description: Called when a PCI error is detected. 
+ * + * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT. + */ +pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + DRM_INFO("PCI error: detected callback, state(%d)!!\n", state); + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + DRM_WARN("No support for XGMI hive yet..."); + return PCI_ERS_RESULT_DISCONNECT; + } + + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + /* Fatal error, prepare for slot reset */ + case pci_channel_io_frozen: + /* + * Cancel and wait for all TDRs in progress if failing to + * set adev->in_gpu_reset in amdgpu_device_lock_adev + * + * Locking adev->reset_sem will prevent any external access + * to GPU during PCI error recovery + */ + while (!amdgpu_device_lock_adev(adev, NULL)) + amdgpu_cancel_all_tdr(adev); + + /* + * Block any work scheduling as we do for regular GPU reset + * for the duration of the recovery + */ + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + drm_sched_stop(&ring->sched, NULL); + } + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + /* Permanent error, prepare for device removal */ + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +/** + * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers + * @pdev: pointer to PCI device + */ +pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev) +{ + + DRM_INFO("PCI error: mmio enabled callback!!\n"); + + /* TODO - dump whatever for debugging purposes */ + + /* This called only if amdgpu_pci_error_detected returns + * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still + * works, no need to reset slot. + */ + + return PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_slot_reset - Called when PCI slot has been reset. + * @pdev: PCI device struct + * + * Description: This routine is called by the pci error recovery + * code after the PCI slot has been reset, just before we + * should resume normal operations. + */ +pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r, i; + bool need_full_reset = true; + u32 memsize; + struct list_head device_list; + + DRM_INFO("PCI error: slot reset callback!!\n"); + + INIT_LIST_HEAD(&device_list); + list_add_tail(&adev->gmc.xgmi.head, &device_list); + + /* wait for asic to come out of reset */ + msleep(500); + + /* Restore PCI confspace */ + amdgpu_device_load_pci_state(pdev); + + /* confirm ASIC came out of reset */ + for (i = 0; i < adev->usec_timeout; i++) { + memsize = amdgpu_asic_get_config_memsize(adev); + + if (memsize != 0xffffffff) + break; + udelay(1); + } + if (memsize == 0xffffffff) { + r = -ETIME; + goto out; + } + + adev->in_pci_err_recovery = true; + r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); + adev->in_pci_err_recovery = false; + if (r) + goto out; + + r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); + +out: + if (!r) { + if (amdgpu_device_cache_pci_state(adev->pdev)) + pci_restore_state(adev->pdev); + + DRM_INFO("PCIe error recovery succeeded\n"); + } else { + DRM_ERROR("PCIe error recovery failed, err:%d", r); + amdgpu_device_unlock_adev(adev); + } + + return r ? 
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; +} + +/** + * amdgpu_pci_resume() - resume normal ops after PCI reset + * @pdev: pointer to PCI device + * + * Called when the error recovery driver tells us that its + * OK to resume normal operation. Use completion to allow + * halted scsi ops to resume. + */ +void amdgpu_pci_resume(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int i; + + + DRM_INFO("PCI error: resume callback!!\n"); + + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + + drm_sched_resubmit_jobs(&ring->sched); + drm_sched_start(&ring->sched, true); + } + + amdgpu_device_unlock_adev(adev); +} + +bool amdgpu_device_cache_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + r = pci_save_state(pdev); + if (!r) { + kfree(adev->pci_state); + + adev->pci_state = pci_store_saved_state(pdev); + + if (!adev->pci_state) { + DRM_ERROR("Failed to store PCI saved state"); + return false; + } + } else { + DRM_WARN("Failed to save PCI state, err:%d\n", r); + return false; + } + + return true; +} + +bool amdgpu_device_load_pci_state(struct pci_dev *pdev) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = drm_to_adev(dev); + int r; + + if (!adev->pci_state) + return false; + + r = pci_load_saved_state(pdev, adev->pci_state); + + if (!r) { + pci_restore_state(pdev); + } else { + DRM_WARN("Failed to load PCI state, err:%d\n", r); + return false; + } + + return true; +} + + |
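The four PCI error callbacks added at the end of this patch (amdgpu_pci_error_detected, amdgpu_pci_mmio_enabled, amdgpu_pci_slot_reset and amdgpu_pci_resume) only take effect once they are referenced from the driver's struct pci_error_handlers, which is registered with the PCI core as part of the pci_driver definition outside this file. A rough sketch of that wiring, with the unrelated pci_driver fields elided:

#include <linux/pci.h>

/* Sketch of the registration side; the real definition lives outside this file. */
static const struct pci_error_handlers amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};

static struct pci_driver amdgpu_kms_pci_driver = {
	.name		= "amdgpu",
	/* .id_table, .probe, .remove, ... as already defined by the driver */
	.err_handler	= &amdgpu_pci_err_handler,
};

The indirect register helpers added earlier in the patch (amdgpu_device_indirect_rreg/_wreg and their 64-bit variants) are likewise meant to be called from each ASIC's pcie_rreg/pcie_wreg callbacks rather than used directly. Assuming the NBIO callbacks that expose the PCIE index/data offsets (as other parts of the driver do), such a callback would look roughly like this:

/* Sketch of an ASIC pcie_rreg/pcie_wreg pair built on the new helpers. */
static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
{
	u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	return amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data, reg);
}

static void example_pcie_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
	u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

	amdgpu_device_indirect_wreg(adev, pcie_index, pcie_data, reg, v);
}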