Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 926
1 file changed, 505 insertions(+), 421 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6238701cde23..8dee52ce26d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -32,8 +32,6 @@ #include <linux/slab.h> #include <linux/iommu.h> #include <linux/pci.h> -#include <linux/devcoredump.h> -#include <generated/utsrelease.h> #include <linux/pci-p2pdma.h> #include <linux/apple-gmux.h> @@ -43,6 +41,7 @@ #include <drm/drm_fb_helper.h> #include <drm/drm_probe_helper.h> #include <drm/amdgpu_drm.h> +#include <linux/device.h> #include <linux/vgaarb.h> #include <linux/vga_switcheroo.h> #include <linux/efi.h> @@ -74,6 +73,7 @@ #include "amdgpu_pmu.h" #include "amdgpu_fru_eeprom.h" #include "amdgpu_reset.h" +#include "amdgpu_virt.h" #include <linux/suspend.h> #include <drm/task_barrier.h> @@ -159,76 +159,79 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev, return sysfs_emit(buf, "%llu\n", cnt); } -static DEVICE_ATTR(pcie_replay_count, S_IRUGO, +static DEVICE_ATTR(pcie_replay_count, 0444, amdgpu_device_get_pcie_replay_count, NULL); -static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); - /** - * DOC: product_name + * DOC: board_info + * + * The amdgpu driver provides a sysfs API for giving board related information. + * It provides the form factor information in the format + * + * type : form factor + * + * Possible form factor values + * + * - "cem" - PCIE CEM card + * - "oam" - Open Compute Accelerator Module + * - "unknown" - Not known * - * The amdgpu driver provides a sysfs API for reporting the product name - * for the device - * The file product_name is used for this and returns the product name - * as returned from the FRU. - * NOTE: This is only available for certain server cards */ -static ssize_t amdgpu_device_get_product_name(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t amdgpu_device_get_board_info(struct device *dev, + struct device_attribute *attr, + char *buf) { struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); + enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM; + const char *pkg; - return sysfs_emit(buf, "%s\n", adev->product_name); -} + if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type) + pkg_type = adev->smuio.funcs->get_pkg_type(adev); -static DEVICE_ATTR(product_name, S_IRUGO, - amdgpu_device_get_product_name, NULL); - -/** - * DOC: product_number - * - * The amdgpu driver provides a sysfs API for reporting the part number - * for the device - * The file product_number is used for this and returns the part number - * as returned from the FRU. 
- * NOTE: This is only available for certain server cards - */ - -static ssize_t amdgpu_device_get_product_number(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct drm_device *ddev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(ddev); + switch (pkg_type) { + case AMDGPU_PKG_TYPE_CEM: + pkg = "cem"; + break; + case AMDGPU_PKG_TYPE_OAM: + pkg = "oam"; + break; + default: + pkg = "unknown"; + break; + } - return sysfs_emit(buf, "%s\n", adev->product_number); + return sysfs_emit(buf, "%s : %s\n", "type", pkg); } -static DEVICE_ATTR(product_number, S_IRUGO, - amdgpu_device_get_product_number, NULL); +static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL); -/** - * DOC: serial_number - * - * The amdgpu driver provides a sysfs API for reporting the serial number - * for the device - * The file serial_number is used for this and returns the serial number - * as returned from the FRU. - * NOTE: This is only available for certain server cards - */ +static struct attribute *amdgpu_board_attrs[] = { + &dev_attr_board_info.attr, + NULL, +}; -static ssize_t amdgpu_device_get_serial_number(struct device *dev, - struct device_attribute *attr, char *buf) +static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int n) { + struct device *dev = kobj_to_dev(kobj); struct drm_device *ddev = dev_get_drvdata(dev); struct amdgpu_device *adev = drm_to_adev(ddev); - return sysfs_emit(buf, "%s\n", adev->serial); + if (adev->flags & AMD_IS_APU) + return 0; + + return attr->mode; } -static DEVICE_ATTR(serial_number, S_IRUGO, - amdgpu_device_get_serial_number, NULL); +static const struct attribute_group amdgpu_board_attrs_group = { + .attrs = amdgpu_board_attrs, + .is_visible = amdgpu_board_attrs_is_visible +}; + +static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); + /** * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control @@ -370,10 +373,16 @@ size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos, if (write) { memcpy_toio(addr, buf, count); + /* Make sure HDP write cache flush happens without any reordering + * after the system memory contents are sent over PCIe device + */ mb(); amdgpu_device_flush_hdp(adev, NULL); } else { amdgpu_device_invalidate_hdp(adev, NULL); + /* Make sure HDP read cache is invalidated before issuing a read + * to the PCIe device + */ mb(); memcpy_fromio(buf, addr, count); } @@ -464,7 +473,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_domain->sem)) { - ret = amdgpu_kiq_rreg(adev, reg); + ret = amdgpu_kiq_rreg(adev, reg, 0); up_read(&adev->reset_domain->sem); } else { ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); @@ -481,8 +490,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, /* * MMIO register read with bytes helper functions * @offset:bytes offset from MMIO start - * -*/ + */ /** * amdgpu_mm_rreg8 - read a memory mapped IO register @@ -502,12 +510,55 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) BUG(); } + +/** + * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC + * + * @adev: amdgpu_device pointer + * @reg: dword aligned register offset + * @acc_flags: access flags which require special behavior + * @xcc_id: xcc accelerated compute core id + * + * Returns the 32 bit value from the offset specified. 
+ */ +uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev, + uint32_t reg, uint32_t acc_flags, + uint32_t xcc_id) +{ + uint32_t ret, rlcg_flag; + + if (amdgpu_device_skip_hw_access(adev)) + return 0; + + if ((reg * 4) < adev->rmmio_size) { + if (amdgpu_sriov_vf(adev) && + !amdgpu_sriov_runtime(adev) && + adev->gfx.rlc.rlcg_reg_access_supported && + amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, + GC_HWIP, false, + &rlcg_flag)) { + ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id); + } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(&adev->reset_domain->sem)) { + ret = amdgpu_kiq_rreg(adev, reg, xcc_id); + up_read(&adev->reset_domain->sem); + } else { + ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); + } + } else { + ret = adev->pcie_rreg(adev, reg * 4); + } + + return ret; +} + /* * MMIO register write with bytes helper functions * @offset:bytes offset from MMIO start * @value: the value want to be written to the register - * -*/ + */ + /** * amdgpu_mm_wreg8 - read a memory mapped IO register * @@ -549,7 +600,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) && down_read_trylock(&adev->reset_domain->sem)) { - amdgpu_kiq_wreg(adev, reg, v); + amdgpu_kiq_wreg(adev, reg, v, 0); up_read(&adev->reset_domain->sem); } else { writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); @@ -567,11 +618,13 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, * @adev: amdgpu_device pointer * @reg: mmio/rlc register * @v: value to write + * @xcc_id: xcc accelerated compute core id * * this function is invoked only for the debugfs register access */ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, - uint32_t reg, uint32_t v) + uint32_t reg, uint32_t v, + uint32_t xcc_id) { if (amdgpu_device_skip_hw_access(adev)) return; @@ -580,7 +633,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, adev->gfx.rlc.funcs && adev->gfx.rlc.funcs->is_rlcg_access_range) { if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg)) - return amdgpu_sriov_wreg(adev, reg, v, 0, 0); + return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id); } else if ((reg * 4) >= adev->rmmio_size) { adev->pcie_wreg(adev, reg * 4, v); } else { @@ -589,90 +642,43 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, } /** - * amdgpu_mm_rdoorbell - read a doorbell dword + * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC * * @adev: amdgpu_device pointer - * @index: doorbell index - * - * Returns the value in the doorbell aperture at the - * requested doorbell index (CIK). - */ -u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) -{ - if (amdgpu_device_skip_hw_access(adev)) - return 0; - - if (index < adev->doorbell.num_kernel_doorbells) { - return readl(adev->doorbell.ptr + index); - } else { - DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); - return 0; - } -} - -/** - * amdgpu_mm_wdoorbell - write a doorbell dword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write - * - * Writes @v to the doorbell aperture at the - * requested doorbell index (CIK). 
- */ -void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) -{ - if (amdgpu_device_skip_hw_access(adev)) - return; - - if (index < adev->doorbell.num_kernel_doorbells) { - writel(v, adev->doorbell.ptr + index); - } else { - DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); - } -} - -/** - * amdgpu_mm_rdoorbell64 - read a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index + * @reg: dword aligned register offset + * @v: 32 bit value to write to the register + * @acc_flags: access flags which require special behavior + * @xcc_id: xcc accelerated compute core id * - * Returns the value in the doorbell aperture at the - * requested doorbell index (VEGA10+). + * Writes the value specified to the offset specified. */ -u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) +void amdgpu_device_xcc_wreg(struct amdgpu_device *adev, + uint32_t reg, uint32_t v, + uint32_t acc_flags, uint32_t xcc_id) { - if (amdgpu_device_skip_hw_access(adev)) - return 0; - - if (index < adev->doorbell.num_kernel_doorbells) { - return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); - } else { - DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index); - return 0; - } -} + uint32_t rlcg_flag; -/** - * amdgpu_mm_wdoorbell64 - write a doorbell Qword - * - * @adev: amdgpu_device pointer - * @index: doorbell index - * @v: value to write - * - * Writes @v to the doorbell aperture at the - * requested doorbell index (VEGA10+). - */ -void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) -{ if (amdgpu_device_skip_hw_access(adev)) return; - if (index < adev->doorbell.num_kernel_doorbells) { - atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); + if ((reg * 4) < adev->rmmio_size) { + if (amdgpu_sriov_vf(adev) && + !amdgpu_sriov_runtime(adev) && + adev->gfx.rlc.rlcg_reg_access_supported && + amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, + GC_HWIP, true, + &rlcg_flag)) { + amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id); + } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && + amdgpu_sriov_runtime(adev) && + down_read_trylock(&adev->reset_domain->sem)) { + amdgpu_kiq_wreg(adev, reg, v, xcc_id); + up_read(&adev->reset_domain->sem); + } else { + writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); + } } else { - DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index); + adev->pcie_wreg(adev, reg * 4, v); } } @@ -718,7 +724,7 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev, pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); - if (adev->nbio.funcs->get_pcie_index_hi_offset) + if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); else pcie_index_hi = 0; @@ -785,6 +791,56 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev, return r; } +u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev, + u64 reg_addr) +{ + unsigned long flags, pcie_index, pcie_data; + unsigned long pcie_index_hi = 0; + void __iomem *pcie_index_offset; + void __iomem *pcie_index_hi_offset; + void __iomem *pcie_data_offset; + u64 r; + + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) + pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + 
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + if (pcie_index_hi != 0) + pcie_index_hi_offset = (void __iomem *)adev->rmmio + + pcie_index_hi * 4; + + /* read low 32 bits */ + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + r = readl(pcie_data_offset); + /* read high 32 bits */ + writel(reg_addr + 4, pcie_index_offset); + readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + r |= ((u64)readl(pcie_data_offset) << 32); + + /* clear the high bits */ + if (pcie_index_hi != 0) { + writel(0, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); + + return r; +} + /** * amdgpu_device_indirect_wreg - write an indirect register address * @@ -824,7 +880,7 @@ void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev, pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); - if (adev->nbio.funcs->get_pcie_index_hi_offset) + if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); else pcie_index_hi = 0; @@ -889,6 +945,55 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev, spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); } +void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev, + u64 reg_addr, u64 reg_data) +{ + unsigned long flags, pcie_index, pcie_data; + unsigned long pcie_index_hi = 0; + void __iomem *pcie_index_offset; + void __iomem *pcie_index_hi_offset; + void __iomem *pcie_data_offset; + + pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev); + pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev); + if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset)) + pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev); + + spin_lock_irqsave(&adev->pcie_idx_lock, flags); + pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; + pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4; + if (pcie_index_hi != 0) + pcie_index_hi_offset = (void __iomem *)adev->rmmio + + pcie_index_hi * 4; + + /* write low 32 bits */ + writel(reg_addr, pcie_index_offset); + readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset); + readl(pcie_data_offset); + /* write high 32 bits */ + writel(reg_addr + 4, pcie_index_offset); + readl(pcie_index_offset); + if (pcie_index_hi != 0) { + writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + writel((u32)(reg_data >> 32), pcie_data_offset); + readl(pcie_data_offset); + + /* clear the high bits */ + if (pcie_index_hi != 0) { + writel(0, pcie_index_hi_offset); + readl(pcie_index_hi_offset); + } + + spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); +} + /** * amdgpu_device_get_rev_id - query device rev_id * @@ -966,6 +1071,13 @@ static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg) return 0; } +static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg) +{ + DRM_ERROR("Invalid callback to read register 0x%llX\n", reg); + BUG(); + return 0; +} + /** * 
amdgpu_invalid_wreg64 - dummy reg write function * @@ -983,6 +1095,13 @@ static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint BUG(); } +static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v) +{ + DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n", + reg, v); + BUG(); +} + /** * amdgpu_block_invalid_rreg - dummy reg read function * @@ -1032,13 +1151,22 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev, */ static int amdgpu_device_asic_init(struct amdgpu_device *adev) { + int ret; + amdgpu_asic_pre_asic_init(adev); - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) || - adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0)) - return amdgpu_atomfirmware_asic_init(adev, true); - else + if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || + amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) { + amdgpu_psp_wait_for_bootloader(adev); + ret = amdgpu_atomfirmware_asic_init(adev, true); + /* TODO: check the return val and stop device initialization if boot fails */ + amdgpu_psp_query_boot_status(adev); + return ret; + } else { return amdgpu_atom_asic_init(adev->mode_info.atom_context); + } + + return 0; } /** @@ -1078,7 +1206,7 @@ static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev) * @registers: pointer to the register array * @array_size: size of the register array * - * Programs an array or registers with and and or masks. + * Programs an array or registers with and or masks. * This is a helper for setting golden registers. */ void amdgpu_device_program_register_sequence(struct amdgpu_device *adev, @@ -1136,83 +1264,6 @@ int amdgpu_device_pci_reset(struct amdgpu_device *adev) } /* - * GPU doorbell aperture helpers function. - */ -/** - * amdgpu_device_doorbell_init - Init doorbell driver information. - * - * @adev: amdgpu_device pointer - * - * Init doorbell driver information (CIK) - * Returns 0 on success, error on failure. - */ -static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) -{ - - /* No doorbell on SI hardware generation */ - if (adev->asic_type < CHIP_BONAIRE) { - adev->doorbell.base = 0; - adev->doorbell.size = 0; - adev->doorbell.num_kernel_doorbells = 0; - adev->doorbell.ptr = NULL; - return 0; - } - - if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) - return -EINVAL; - - amdgpu_asic_init_doorbell_index(adev); - - /* doorbell bar mapping */ - adev->doorbell.base = pci_resource_start(adev->pdev, 2); - adev->doorbell.size = pci_resource_len(adev->pdev, 2); - - if (adev->enable_mes) { - adev->doorbell.num_kernel_doorbells = - adev->doorbell.size / sizeof(u32); - } else { - adev->doorbell.num_kernel_doorbells = - min_t(u32, adev->doorbell.size / sizeof(u32), - adev->doorbell_index.max_assignment+1); - if (adev->doorbell.num_kernel_doorbells == 0) - return -EINVAL; - - /* For Vega, reserve and map two pages on doorbell BAR since SDMA - * paging queue doorbell use the second page. The - * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the - * doorbells are in the first page. 
So with paging queue enabled, - * the max num_kernel_doorbells should + 1 page (0x400 in dword) - */ - if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) && - adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0)) - adev->doorbell.num_kernel_doorbells += 0x400; - } - - adev->doorbell.ptr = ioremap(adev->doorbell.base, - adev->doorbell.num_kernel_doorbells * - sizeof(u32)); - if (adev->doorbell.ptr == NULL) - return -ENOMEM; - - return 0; -} - -/** - * amdgpu_device_doorbell_fini - Tear down doorbell driver information. - * - * @adev: amdgpu_device pointer - * - * Tear down doorbell driver information (CIK) - */ -static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev) -{ - iounmap(adev->doorbell.ptr); - adev->doorbell.ptr = NULL; -} - - - -/* * amdgpu_device_wb_*() * Writeback is the method by which the GPU updates special pages in memory * with the status of certain GPU events (fences, ring pointers,etc.). @@ -1321,10 +1372,13 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size); struct pci_bus *root; struct resource *res; - unsigned i; + unsigned int i; u16 cmd; int r; + if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return 0; + /* Bypass for VF */ if (amdgpu_sriov_vf(adev)) return 0; @@ -1359,7 +1413,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) cmd & ~PCI_COMMAND_MEMORY); /* Free the VRAM and doorbell BAR, we most likely need to move both. */ - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); if (adev->asic_type >= CHIP_BONAIRE) pci_release_resource(adev->pdev, 2); @@ -1376,7 +1430,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) /* When the doorbell or fb BAR isn't available we have no chance of * using the device. */ - r = amdgpu_device_doorbell_init(adev); + r = amdgpu_doorbell_init(adev); if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET)) return -ENODEV; @@ -1387,9 +1441,8 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) static bool amdgpu_device_read_bios(struct amdgpu_device *adev) { - if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) { + if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) return false; - } return true; } @@ -1425,6 +1478,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) if (adev->asic_type == CHIP_FIJI) { int err; uint32_t fw_ver; + err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev); /* force vPost if error occured */ if (err) @@ -1459,40 +1513,45 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) } /* - * On APUs with >= 64GB white flickering has been observed w/ SG enabled. - * Disable S/G on such systems until we have a proper fix. - * https://gitlab.freedesktop.org/drm/amd/-/issues/2354 - * https://gitlab.freedesktop.org/drm/amd/-/issues/2735 + * Check whether seamless boot is supported. + * + * So far we only support seamless boot on DCE 3.0 or later. + * If users report that it works on older ASICS as well, we may + * loosen this. 
*/ -bool amdgpu_sg_display_supported(struct amdgpu_device *adev) +bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev) { - switch (amdgpu_sg_display) { + switch (amdgpu_seamless) { case -1: break; - case 0: - return false; case 1: return true; + case 0: + return false; default: + DRM_ERROR("Invalid value for amdgpu.seamless: %d\n", + amdgpu_seamless); return false; } - if ((totalram_pages() << (PAGE_SHIFT - 10)) + - (adev->gmc.real_vram_size / 1024) >= 64000000) { - DRM_WARN("Disabling S/G due to >=64GB RAM\n"); + + if (!(adev->flags & AMD_IS_APU)) return false; - } - return true; + + if (adev->mman.keep_stolen_vga_memory) + return false; + + return adev->ip_versions[DCE_HWIP][0] >= IP_VERSION(3, 0, 0); } /* - * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic - * speed switching. Until we have confirmation from Intel that a specific host - * supports it, it's safer that we keep it disabled for all. + * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids + * don't support dynamic speed switching. Until we have confirmation from Intel + * that a specific host supports it, it's safer that we keep it disabled for all. * * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 */ -bool amdgpu_device_pcie_dynamic_switching_supported(void) +static bool amdgpu_device_pcie_dynamic_switching_supported(void) { #if IS_ENABLED(CONFIG_X86) struct cpuinfo_x86 *c = &cpu_data(0); @@ -1525,20 +1584,13 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev) default: return false; } + if (adev->flags & AMD_IS_APU) + return false; + if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK)) + return false; return pcie_aspm_enabled(adev->pdev); } -bool amdgpu_device_aspm_support_quirk(void) -{ -#if IS_ENABLED(CONFIG_X86) - struct cpuinfo_x86 *c = &cpu_data(0); - - return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE); -#else - return true; -#endif -} - /* if we get transitioned to only one device, take VGA back */ /** * amdgpu_device_vga_set_decode - enable/disable vga decode @@ -1553,6 +1605,7 @@ static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev, bool state) { struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev)); + amdgpu_asic_set_vga_state(adev, state); if (state) return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM | @@ -1575,7 +1628,8 @@ static void amdgpu_device_check_block_size(struct amdgpu_device *adev) { /* defines number of bits in page table versus page directory, * a page is 4KB so we have 12 bits offset, minimum 9 bits in the - * page table and the remaining bits are in the page directory */ + * page table and the remaining bits are in the page directory + */ if (amdgpu_vm_block_size == -1) return; @@ -1785,6 +1839,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, } else { pr_info("switched off\n"); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; + amdgpu_device_prepare(dev); amdgpu_device_suspend(dev, true); amdgpu_device_cache_pci_state(pdev); /* Shut down the device */ @@ -1807,7 +1862,7 @@ static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev) { struct drm_device *dev = pci_get_drvdata(pdev); - /* + /* * FIXME: open_count is protected by drm_global_mutex but that would lead to * locking inversion with the driver load path. And the access here is * completely racy anyway. 
So don't bother with locking for now. @@ -2256,7 +2311,6 @@ out: */ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) { - struct drm_device *dev = adev_to_drm(adev); struct pci_dev *parent; int i, r; bool total; @@ -2327,11 +2381,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) (amdgpu_is_atpx_hybrid() || amdgpu_has_atpx_dgpu_power_cntl()) && ((adev->flags & AMD_IS_APU) == 0) && - !pci_is_thunderbolt_attached(to_pci_dev(dev->dev))) + !dev_is_removable(&adev->pdev->dev)) adev->flags |= AMD_IS_PX; if (!(adev->flags & AMD_IS_APU)) { - parent = pci_upstream_bridge(adev->pdev); + parent = pcie_find_root_port(adev->pdev); adev->has_pr3 = parent ? pci_pr3_present(parent) : false; } @@ -2341,6 +2395,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID) adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK; + if (!amdgpu_device_pcie_dynamic_switching_supported()) + adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK; total = true; for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2518,6 +2574,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) } r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, + DRM_SCHED_PRIORITY_COUNT, ring->num_hw_submission, 0, timeout, adev->reset_domain->wq, ring->sched_score, ring->name, @@ -2527,6 +2584,18 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) ring->name); return r; } + r = amdgpu_uvd_entity_init(adev, ring); + if (r) { + DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n", + ring->name); + return r; + } + r = amdgpu_vce_entity_init(adev, ring); + if (r) { + DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n", + ring->name); + return r; + } } amdgpu_xcp_update_partition_sched_list(adev); @@ -2687,6 +2756,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; + if (adev->mman.buffer_funcs_ring->sched.ready) + amdgpu_ttm_set_buffer_funcs_status(adev, true); + /* Don't init kfd if whole hive need to be reset during init */ if (!adev->gmc.xgmi.pending_reset) { kgd2kfd_init_zone_device(adev); @@ -2969,7 +3041,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) { int i, r; - if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) + if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0)) return; for (i = 0; i < adev->num_ip_blocks; i++) { @@ -3222,8 +3294,10 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */ if (adev->in_s0ix && - (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) && - (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA)) + (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >= + IP_VERSION(5, 0, 0)) && + (adev->ip_blocks[i].version->type == + AMD_IP_BLOCK_TYPE_SDMA)) continue; /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot. @@ -3282,6 +3356,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev) amdgpu_virt_request_full_gpu(adev, false); } + amdgpu_ttm_set_buffer_funcs_status(adev, false); + r = amdgpu_device_ip_suspend_phase1(adev); if (r) return r; @@ -3452,7 +3528,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev) * * Main resume function for hardware IPs. 
The hardware IPs * are split into two resume functions because they are - * are also used in in recovering from a GPU reset and some additional + * also used in recovering from a GPU reset and some additional * steps need to be take between them. In this case (S3/S4) they are * run sequentially. * Returns 0 on success, negative error code on failure. @@ -3461,12 +3537,6 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev) { int r; - if (!adev->in_s0ix) { - r = amdgpu_amdkfd_resume_iommu(adev); - if (r) - return r; - } - r = amdgpu_device_ip_resume_phase1(adev); if (r) return r; @@ -3477,6 +3547,9 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev) r = amdgpu_device_ip_resume_phase2(adev); + if (adev->mman.buffer_funcs_ring->sched.ready) + amdgpu_ttm_set_buffer_funcs_status(adev, true); + return r; } @@ -3554,8 +3627,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) #else default: if (amdgpu_dc > 0) - DRM_INFO_ONCE("Display Core has been requested via kernel parameter " - "but isn't supported by ASIC, ignoring\n"); + DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n"); return false; #endif } @@ -3607,9 +3679,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) if (adev->asic_reset_res) goto fail; - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops && - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev); + amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB); } else { task_barrier_full(&hive->tb); @@ -3711,9 +3781,6 @@ static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) } static const struct attribute *amdgpu_dev_attributes[] = { - &dev_attr_product_name.attr, - &dev_attr_product_number.attr, - &dev_attr_serial_number.attr, &dev_attr_pcie_replay_count.attr, NULL }; @@ -3724,10 +3791,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev) adev->gfx.mcbp = true; else if (amdgpu_mcbp == 0) adev->gfx.mcbp = false; - else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) && - (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) && - adev->gfx.num_gfx_rings) - adev->gfx.mcbp = true; if (amdgpu_sriov_vf(adev)) adev->gfx.mcbp = true; @@ -3790,6 +3853,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->pciep_wreg = &amdgpu_invalid_wreg; adev->pcie_rreg64 = &amdgpu_invalid_rreg64; adev->pcie_wreg64 = &amdgpu_invalid_wreg64; + adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext; + adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext; adev->uvd_ctx_rreg = &amdgpu_invalid_rreg; adev->uvd_ctx_wreg = &amdgpu_invalid_wreg; adev->didt_rreg = &amdgpu_invalid_rreg; @@ -3804,7 +3869,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision); /* mutex initialization are all done here so we - * can recall function without having locking issues */ + * can recall function without having locking issues + */ mutex_init(&adev->firmware.mutex); mutex_init(&adev->pm.mutex); mutex_init(&adev->gfx.gpu_clock_mutex); @@ -3844,6 +3910,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->ras_list); + INIT_LIST_HEAD(&adev->pm.od_kobj_list); + INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -3881,11 +3949,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, 
atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); - if (adev->rmmio == NULL) { + if (!adev->rmmio) return -ENOMEM; - } + DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); - DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); + DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size); /* * Reset domain needs to be present early, before XGMI hive discovered @@ -3940,7 +4008,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, * internal path natively support atomics, set have_atomics_support to true. */ } else if ((adev->flags & AMD_IS_APU) && - (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) { + (amdgpu_ip_version(adev, GC_HWIP, 0) > + IP_VERSION(9, 0, 0))) { adev->have_atomics_support = true; } else { adev->have_atomics_support = @@ -3953,7 +4022,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, dev_info(adev->dev, "PCIE atomic ops is not supported\n"); /* doorbell bar mapping and doorbell index init*/ - amdgpu_device_doorbell_init(adev); + amdgpu_doorbell_init(adev); if (amdgpu_emu_mode == 1) { /* post the asic on emulation mode */ @@ -3988,13 +4057,23 @@ int amdgpu_device_init(struct amdgpu_device *adev, } } } else { - tmp = amdgpu_reset_method; - /* It should do a default reset when loading or reloading the driver, - * regardless of the module parameter reset_method. - */ - amdgpu_reset_method = AMD_RESET_METHOD_NONE; - r = amdgpu_asic_reset(adev); - amdgpu_reset_method = tmp; + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { + case IP_VERSION(13, 0, 0): + case IP_VERSION(13, 0, 7): + case IP_VERSION(13, 0, 10): + r = psp_gpu_reset(adev); + break; + default: + tmp = amdgpu_reset_method; + /* It should do a default reset when loading or reloading the driver, + * regardless of the module parameter reset_method. + */ + amdgpu_reset_method = AMD_RESET_METHOD_NONE; + r = amdgpu_asic_reset(adev); + amdgpu_reset_method = tmp; + break; + } + if (r) { dev_err(adev->dev, "asic reset on init failed\n"); goto failed; @@ -4080,30 +4159,6 @@ fence_driver_init: /* Get a log2 for easy divisions. */ adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); - r = amdgpu_atombios_sysfs_init(adev); - if (r) - drm_err(&adev->ddev, - "registering atombios sysfs failed (%d).\n", r); - - r = amdgpu_pm_sysfs_init(adev); - if (r) - DRM_ERROR("registering pm sysfs failed (%d).\n", r); - - r = amdgpu_ucode_sysfs_init(adev); - if (r) { - adev->ucode_sysfs_en = false; - DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); - } else - adev->ucode_sysfs_en = true; - - r = amdgpu_psp_sysfs_init(adev); - if (r) { - adev->psp_sysfs_en = false; - if (!amdgpu_sriov_vf(adev)) - DRM_ERROR("Creating psp sysfs failed\n"); - } else - adev->psp_sysfs_en = true; - /* * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. * Otherwise the mgpu fan boost feature will be skipped due to the @@ -4132,10 +4187,38 @@ fence_driver_init: flush_delayed_work(&adev->delayed_init_work); } + /* + * Place those sysfs registering after `late_init`. As some of those + * operations performed in `late_init` might affect the sysfs + * interfaces creating. 
+ */ + r = amdgpu_atombios_sysfs_init(adev); + if (r) + drm_err(&adev->ddev, + "registering atombios sysfs failed (%d).\n", r); + + r = amdgpu_pm_sysfs_init(adev); + if (r) + DRM_ERROR("registering pm sysfs failed (%d).\n", r); + + r = amdgpu_ucode_sysfs_init(adev); + if (r) { + adev->ucode_sysfs_en = false; + DRM_ERROR("Creating firmware sysfs failed (%d).\n", r); + } else + adev->ucode_sysfs_en = true; + r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); if (r) dev_err(adev->dev, "Could not create amdgpu device attr\n"); + r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group); + if (r) + dev_err(adev->dev, + "Could not create amdgpu board attributes\n"); + + amdgpu_fru_sysfs_init(adev); + if (IS_ENABLED(CONFIG_PERF_EVENTS)) r = amdgpu_pmu_init(adev); if (r) @@ -4147,13 +4230,14 @@ fence_driver_init: /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */ /* this will fail for cards that aren't VGA class devices, just - * ignore it */ + * ignore it + */ if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA) vga_client_register(adev->pdev, amdgpu_device_vga_set_decode); px = amdgpu_device_supports_px(ddev); - if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, px); @@ -4199,7 +4283,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); /* Unmap all mapped bars - Doorbell, registers and VRAM */ - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); iounmap(adev->rmmio); adev->rmmio = NULL; @@ -4230,7 +4314,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test - * */ + */ if (amdgpu_sriov_vf(adev)) { amdgpu_virt_request_full_gpu(adev, false); amdgpu_virt_fini_data_exchange(adev); @@ -4253,13 +4337,14 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_pm_sysfs_fini(adev); if (adev->ucode_sysfs_en) amdgpu_ucode_sysfs_fini(adev); - if (adev->psp_sysfs_en) - amdgpu_psp_sysfs_fini(adev); sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); + amdgpu_fru_sysfs_fini(adev); /* disable ras feature must before hw fini */ amdgpu_ras_pre_fini(adev); + amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_device_ip_fini_early(adev); amdgpu_irq_fini_hw(adev); @@ -4297,9 +4382,12 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) kfree(adev->bios); adev->bios = NULL; + kfree(adev->fru_info); + adev->fru_info = NULL; + px = amdgpu_device_supports_px(adev_to_drm(adev)); - if (px || (!pci_is_thunderbolt_attached(adev->pdev) && + if (px || (!dev_is_removable(&adev->pdev->dev) && apple_gmux_detect(NULL, NULL))) vga_switcheroo_unregister_client(adev->pdev); @@ -4313,7 +4401,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; - amdgpu_device_doorbell_fini(adev); + amdgpu_doorbell_fini(adev); drm_dev_exit(idx); } @@ -4356,6 +4444,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) * Suspend & resume. */ /** + * amdgpu_device_prepare - prepare for device suspend + * + * @dev: drm dev pointer + * + * Prepare to put the hw in the suspend state (all asics). + * Returns 0 for success or an error on failure. + * Called at driver suspend. 
+ */ +int amdgpu_device_prepare(struct drm_device *dev) +{ + struct amdgpu_device *adev = drm_to_adev(dev); + int i, r; + + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) + return 0; + + /* Evict the majority of BOs before starting suspend sequence */ + r = amdgpu_device_evict_resources(adev); + if (r) + return r; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.valid) + continue; + if (!adev->ip_blocks[i].version->funcs->prepare_suspend) + continue; + r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); + if (r) + return r; + } + + return 0; +} + +/** * amdgpu_device_suspend - initiate device suspend * * @dev: drm dev pointer @@ -4375,11 +4498,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) adev->in_suspend = true; - /* Evict the majority of BOs before grabbing the full access */ - r = amdgpu_device_evict_resources(adev); - if (r) - return r; - if (amdgpu_sriov_vf(adev)) { amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); @@ -4407,6 +4525,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (r) return r; + amdgpu_ttm_set_buffer_funcs_status(adev, false); + amdgpu_fence_driver_hw_fini(adev); amdgpu_device_ip_suspend_phase2(adev); @@ -4414,6 +4534,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, false); + r = amdgpu_dpm_notify_rlc_state(adev, false); + if (r) + return r; + return 0; } @@ -4459,19 +4583,18 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } amdgpu_fence_driver_hw_init(adev); - r = amdgpu_device_ip_late_init(adev); - if (r) - goto exit; - - queue_delayed_work(system_wq, &adev->delayed_init_work, - msecs_to_jiffies(AMDGPU_RESUME_MS)); - if (!adev->in_s0ix) { r = amdgpu_amdkfd_resume(adev, adev->in_runpm); if (r) goto exit; } + r = amdgpu_device_ip_late_init(adev); + if (r) + goto exit; + + queue_delayed_work(system_wq, &adev->delayed_init_work, + msecs_to_jiffies(AMDGPU_RESUME_MS)); exit: if (amdgpu_sriov_vf(adev)) { amdgpu_virt_init_data_exchange(adev); @@ -4773,6 +4896,10 @@ retry: r = amdgpu_virt_reset_gpu(adev); if (r) return r; + amdgpu_irq_gpu_reset_resume_helper(adev); + + /* some sw clean up VF needs to do before recover */ + amdgpu_virt_post_reset(adev); /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); @@ -4799,7 +4926,6 @@ retry: amdgpu_put_xgmi_hive(hive); if (!r) { - amdgpu_irq_gpu_reset_resume_helper(adev); r = amdgpu_ib_ring_tests(adev); amdgpu_amdkfd_post_reset(adev); @@ -4925,9 +5051,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) } if (ret) - dev_err(adev->dev, "GPU mode1 reset failed\n"); + goto mode1_reset_failed; amdgpu_device_load_pci_state(adev->pdev); + ret = amdgpu_psp_wait_for_bootloader(adev); + if (ret) + goto mode1_reset_failed; /* wait for asic to come out of reset */ for (i = 0; i < adev->usec_timeout; i++) { @@ -4938,7 +5067,17 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) udelay(1); } + if (i >= adev->usec_timeout) { + ret = -ETIMEDOUT; + goto mode1_reset_failed; + } + amdgpu_atombios_scratch_regs_engine_hung(adev, false); + + return 0; + +mode1_reset_failed: + dev_err(adev->dev, "GPU mode1 reset failed\n"); return ret; } @@ -4967,8 +5106,9 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (!ring || !ring->sched.thread) continue; - /*clear job fence from fence drv to avoid force_completion - *leave NULL and vm flush fence in fence drv */ + /* Clear job 
fence from fence drv to avoid force_completion + * leave NULL and vm flush fence in fence drv + */ amdgpu_fence_driver_clear_job_fences(ring); /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ @@ -4982,7 +5122,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, r = amdgpu_reset_prepare_hwcontext(adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ - if (r == -ENOSYS) + if (r == -EOPNOTSUPP) r = 0; else return r; @@ -5022,67 +5162,16 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev) lockdep_assert_held(&adev->reset_domain->sem); - for (i = 0; i < adev->num_regs; i++) { - adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]); - trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], - adev->reset_dump_reg_value[i]); - } - - return 0; -} + for (i = 0; i < adev->reset_info.num_regs; i++) { + adev->reset_info.reset_dump_reg_value[i] = + RREG32(adev->reset_info.reset_dump_reg_list[i]); -#ifdef CONFIG_DEV_COREDUMP -static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset, - size_t count, void *data, size_t datalen) -{ - struct drm_printer p; - struct amdgpu_device *adev = data; - struct drm_print_iterator iter; - int i; - - iter.data = buffer; - iter.offset = 0; - iter.start = offset; - iter.remain = count; - - p = drm_coredump_printer(&iter); - - drm_printf(&p, "**** AMDGPU Device Coredump ****\n"); - drm_printf(&p, "kernel: " UTS_RELEASE "\n"); - drm_printf(&p, "module: " KBUILD_MODNAME "\n"); - drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec); - if (adev->reset_task_info.pid) - drm_printf(&p, "process_name: %s PID: %d\n", - adev->reset_task_info.process_name, - adev->reset_task_info.pid); - - if (adev->reset_vram_lost) - drm_printf(&p, "VRAM is lost due to GPU reset!\n"); - if (adev->num_regs) { - drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n"); - - for (i = 0; i < adev->num_regs; i++) - drm_printf(&p, "0x%08x: 0x%08x\n", - adev->reset_dump_reg_list[i], - adev->reset_dump_reg_value[i]); + trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i], + adev->reset_info.reset_dump_reg_value[i]); } - return count - iter.remain; -} - -static void amdgpu_devcoredump_free(void *data) -{ -} - -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev) -{ - struct drm_device *dev = adev_to_drm(adev); - - ktime_get_ts64(&adev->reset_time); - dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL, - amdgpu_devcoredump_read, amdgpu_devcoredump_free); + return 0; } -#endif int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_reset_context *reset_context) @@ -5100,7 +5189,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, reset_context->reset_device_list = device_list_handle; r = amdgpu_reset_perform_reset(tmp_adev, reset_context); /* If reset handler not implemented, continue; otherwise return */ - if (r == -ENOSYS) + if (r == -EOPNOTSUPP) r = 0; else return r; @@ -5131,7 +5220,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (r) { dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s", r, adev_to_drm(tmp_adev)->unique); - break; + goto out; } } @@ -5150,9 +5239,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (!r && amdgpu_ras_intr_triggered()) { list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops && - 
tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count) - tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev); + amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB); } amdgpu_ras_intr_cleared(); @@ -5178,24 +5265,15 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, dev_warn(tmp_adev->dev, "asic atom init failed!"); } else { dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n"); - r = amdgpu_amdkfd_resume_iommu(tmp_adev); - if (r) - goto out; r = amdgpu_device_ip_resume_phase1(tmp_adev); if (r) goto out; vram_lost = amdgpu_device_check_vram_lost(tmp_adev); -#ifdef CONFIG_DEV_COREDUMP - tmp_adev->reset_vram_lost = vram_lost; - memset(&tmp_adev->reset_task_info, 0, - sizeof(tmp_adev->reset_task_info)); - if (reset_context->job && reset_context->job->vm) - tmp_adev->reset_task_info = - reset_context->job->vm->task_info; - amdgpu_reset_capture_coredumpm(tmp_adev); -#endif + + amdgpu_coredump(tmp_adev, vram_lost, reset_context); + if (vram_lost) { DRM_INFO("VRAM is lost due to GPU reset!\n"); amdgpu_inc_vram_lost(tmp_adev); @@ -5205,10 +5283,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (r) return r; + r = amdgpu_xcp_restore_partition_mode( + tmp_adev->xcp_mgr); + if (r) + goto out; + r = amdgpu_device_ip_resume_phase2(tmp_adev); if (r) goto out; + if (tmp_adev->mman.buffer_funcs_ring->sched.ready) + amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true); + if (vram_lost) amdgpu_device_fill_reset_magic(tmp_adev); @@ -5422,7 +5508,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * Flush RAM to disk so that after reboot * the user can read log and see why the system rebooted. */ - if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { + if (need_emergency_restart && amdgpu_ras_get_context(adev) && + amdgpu_ras_get_context(adev)->reboot) { DRM_WARN("Emergency reboot."); ksys_sync_helper(); @@ -5560,8 +5647,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ adev->asic_reset_res = r; /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */ - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) || - adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) + if (amdgpu_ip_version(adev, GC_HWIP, 0) == + IP_VERSION(9, 4, 2) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3)) amdgpu_ras_resume(adev); } else { r = amdgpu_do_asic_reset(device_list_handle, reset_context); @@ -5586,12 +5674,8 @@ skip_hw_reset: drm_sched_start(&ring->sched, true); } - if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3)) - amdgpu_mes_self_test(tmp_adev); - - if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { + if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); - } if (tmp_adev->asic_reset_res) r = tmp_adev->asic_reset_res; @@ -6264,7 +6348,7 @@ bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev) return true; default: /* IP discovery */ - if (!adev->ip_versions[DCE_HWIP][0] || + if (!amdgpu_ip_version(adev, DCE_HWIP, 0) || (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)) return false; return true; |
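
Usage note (illustrative, not part of the patch): the DOC: board_info block above documents a new read-only sysfs attribute that reports the board form factor as "type : <form factor>", with possible values cem, oam, or unknown. Below is a minimal userspace sketch for reading it; the /sys/class/drm/cardN/device path is an assumption about where the amdgpu PCI device attributes are typically exposed and may differ per system.

    /*
     * Hedged sketch only: read the board_info attribute added by this patch.
     * The path is an assumption (adjust cardN to match the amdgpu device);
     * the attribute lives on the PCI device node, so
     * /sys/bus/pci/devices/<bdf>/board_info is an equivalent location.
     */
    #include <stdio.h>

    int main(void)
    {
    	const char *path = "/sys/class/drm/card0/device/board_info";
    	char line[64];
    	FILE *f = fopen(path, "r");

    	if (!f) {
    		perror("fopen");
    		return 1;
    	}
    	/* Expected output format per DOC: board_info, e.g. "type : cem" */
    	if (fgets(line, sizeof(line), f))
    		printf("%s", line);
    	fclose(f);
    	return 0;
    }

Note that amdgpu_board_attrs_is_visible() in the patch hides this attribute on APUs, so the file is only expected to be present for dGPUs.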