Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  1299
1 file changed, 813 insertions(+), 486 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6238701cde23..932dc93b2e63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -32,8 +32,6 @@
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
-#include <linux/devcoredump.h>
-#include <generated/utsrelease.h>
#include <linux/pci-p2pdma.h>
#include <linux/apple-gmux.h>
@@ -43,6 +41,7 @@
#include <drm/drm_fb_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
+#include <linux/device.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
@@ -74,6 +73,8 @@
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
#include "amdgpu_reset.h"
+#include "amdgpu_virt.h"
+#include "amdgpu_dev_coredump.h"
#include <linux/suspend.h>
#include <drm/task_barrier.h>
@@ -96,6 +97,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
#define AMDGPU_MAX_RETRY_LIMIT 2
#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
+#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
+#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
+#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
static const struct drm_driver amdgpu_kms_driver;
@@ -140,6 +144,8 @@ const char *amdgpu_asic_name[] = {
"LAST",
};
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
+
/**
* DOC: pcie_replay_count
*
@@ -159,76 +165,138 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
return sysfs_emit(buf, "%llu\n", cnt);
}
-static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
+static DEVICE_ATTR(pcie_replay_count, 0444,
amdgpu_device_get_pcie_replay_count, NULL);
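For illustration, a minimal userspace sketch that consumes this read-only attribute; the sysfs path is an assumption and depends on the card's PCI address:

#include <stdio.h>

/* Illustrative only: reads the pcie_replay_count attribute exposed above. */
static int read_replay_count(const char *path)
{
	unsigned long long cnt;
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%llu", &cnt) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);
	printf("pcie replay count: %llu\n", cnt);
	return 0;
}

/* e.g. read_replay_count("/sys/bus/pci/devices/0000:03:00.0/pcie_replay_count") */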
-static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
-
-/**
- * DOC: product_name
- *
- * The amdgpu driver provides a sysfs API for reporting the product name
- * for the device
- * The file product_name is used for this and returns the product name
- * as returned from the FRU.
- * NOTE: This is only available for certain server cards
- */
-
-static ssize_t amdgpu_device_get_product_name(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
+ struct bin_attribute *attr, char *buf,
+ loff_t ppos, size_t count)
{
+ struct device *dev = kobj_to_dev(kobj);
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
+ ssize_t bytes_read;
+
+ switch (ppos) {
+ case AMDGPU_SYS_REG_STATE_XGMI:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_WAFL:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_PCIE:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_USR:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
+ break;
+ case AMDGPU_SYS_REG_STATE_USR_1:
+ bytes_read = amdgpu_asic_get_reg_state(
+ adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return bytes_read;
+}
- return sysfs_emit(buf, "%s\n", adev->product_name);
+BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
+ AMDGPU_SYS_REG_STATE_END);
+
+int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
+{
+ int ret;
+
+ if (!amdgpu_asic_get_reg_state_supported(adev))
+ return 0;
+
+ ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
+
+ return ret;
}
-static DEVICE_ATTR(product_name, S_IRUGO,
- amdgpu_device_get_product_name, NULL);
+void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
+{
+ if (!amdgpu_asic_get_reg_state_supported(adev))
+ return;
+ sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
+}
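The bin attribute above is sized AMDGPU_SYS_REG_STATE_END and uses the read offset (ppos) purely as a selector for the register-state type. A minimal userspace sketch under that assumption; the selector constant is hypothetical and would have to mirror the kernel-side AMDGPU_SYS_REG_STATE_* values:

#include <fcntl.h>
#include <unistd.h>

/* Hypothetical mirror of the kernel-side selector value. */
#define REG_STATE_SEL_XGMI 0

static ssize_t dump_xgmi_reg_state(const char *path, void *buf, size_t len)
{
	int fd = open(path, O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	/* The "offset" selects which register-state block to fetch. */
	n = pread(fd, buf, len, REG_STATE_SEL_XGMI);
	close(fd);
	return n;
}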
/**
- * DOC: product_number
+ * DOC: board_info
+ *
+ * The amdgpu driver provides a sysfs API for reporting board-related information.
+ * It provides the form factor information in the format
+ *
+ * type : form factor
+ *
+ * Possible form factor values
+ *
+ * - "cem" - PCIE CEM card
+ * - "oam" - Open Compute Accelerator Module
+ * - "unknown" - Not known
*
- * The amdgpu driver provides a sysfs API for reporting the part number
- * for the device
- * The file product_number is used for this and returns the part number
- * as returned from the FRU.
- * NOTE: This is only available for certain server cards
*/
-static ssize_t amdgpu_device_get_product_number(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t amdgpu_device_get_board_info(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
+ enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
+ const char *pkg;
+
+ if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
+ pkg_type = adev->smuio.funcs->get_pkg_type(adev);
- return sysfs_emit(buf, "%s\n", adev->product_number);
+ switch (pkg_type) {
+ case AMDGPU_PKG_TYPE_CEM:
+ pkg = "cem";
+ break;
+ case AMDGPU_PKG_TYPE_OAM:
+ pkg = "oam";
+ break;
+ default:
+ pkg = "unknown";
+ break;
+ }
+
+ return sysfs_emit(buf, "%s : %s\n", "type", pkg);
}
-static DEVICE_ATTR(product_number, S_IRUGO,
- amdgpu_device_get_product_number, NULL);
+static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
-/**
- * DOC: serial_number
- *
- * The amdgpu driver provides a sysfs API for reporting the serial number
- * for the device
- * The file serial_number is used for this and returns the serial number
- * as returned from the FRU.
- * NOTE: This is only available for certain server cards
- */
+static struct attribute *amdgpu_board_attrs[] = {
+ &dev_attr_board_info.attr,
+ NULL,
+};
-static ssize_t amdgpu_device_get_serial_number(struct device *dev,
- struct device_attribute *attr, char *buf)
+static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
{
+ struct device *dev = kobj_to_dev(kobj);
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
- return sysfs_emit(buf, "%s\n", adev->serial);
+ if (adev->flags & AMD_IS_APU)
+ return 0;
+
+ return attr->mode;
}
-static DEVICE_ATTR(serial_number, S_IRUGO,
- amdgpu_device_get_serial_number, NULL);
+static const struct attribute_group amdgpu_board_attrs_group = {
+ .attrs = amdgpu_board_attrs,
+ .is_visible = amdgpu_board_attrs_is_visible
+};
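As a usage illustration for the board_info format documented above ("type : form factor"), a small sketch that checks for an OAM part; the sysfs path is an assumption:

#include <stdio.h>
#include <string.h>

/* Illustrative only: parses the single "type : <form factor>" line. */
static int board_is_oam(const char *path)
{
	char line[64] = "";
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return strstr(line, ": oam") != NULL;
}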
+
+static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
+
/**
* amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
@@ -270,16 +338,93 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
*
* @dev: drm_device pointer
*
- * Returns true if the device supporte BACO,
- * otherwise return false.
+ * Return:
+ * 1 if the device supports BACO;
+ * 3 if the device supports MACO (only works if BACO is supported);
+ * otherwise return 0.
*/
-bool amdgpu_device_supports_baco(struct drm_device *dev)
+int amdgpu_device_supports_baco(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
return amdgpu_asic_supports_baco(adev);
}
+void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
+{
+ struct drm_device *dev;
+ int bamaco_support;
+
+ dev = adev_to_drm(adev);
+
+ adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
+ bamaco_support = amdgpu_device_supports_baco(dev);
+
+ switch (amdgpu_runtime_pm) {
+ case 2:
+ if (bamaco_support & MACO_SUPPORT) {
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+ dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
+ } else if (bamaco_support == BACO_SUPPORT) {
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+ dev_info(adev->dev, "Requested mode BAMACO not available,fallback to use BACO\n");
+ }
+ break;
+ case 1:
+ if (bamaco_support & BACO_SUPPORT) {
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+ dev_info(adev->dev, "Forcing BACO for runtime pm\n");
+ }
+ break;
+ case -1:
+ case -2:
+ if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
+ adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
+ dev_info(adev->dev, "Using ATPX for runtime pm\n");
+ } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
+ dev_info(adev->dev, "Using BOCO for runtime pm\n");
+ } else {
+ if (!bamaco_support)
+ goto no_runtime_pm;
+
+ switch (adev->asic_type) {
+ case CHIP_VEGA20:
+ case CHIP_ARCTURUS:
+ /* BACO is not supported on vega20 and arcturus */
+ break;
+ case CHIP_VEGA10:
+ /* enable BACO as runpm mode if noretry=0 */
+ if (!adev->gmc.noretry)
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+ break;
+ default:
+ /* enable BACO as runpm mode on CI+ */
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
+ break;
+ }
+
+ if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
+ if (bamaco_support & MACO_SUPPORT) {
+ adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
+ dev_info(adev->dev, "Using BAMACO for runtime pm\n");
+ } else {
+ dev_info(adev->dev, "Using BACO for runtime pm\n");
+ }
+ }
+ }
+ break;
+ case 0:
+ dev_info(adev->dev, "runtime pm is manually disabled\n");
+ break;
+ default:
+ break;
+ }
+
+no_runtime_pm:
+ if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
+ dev_info(adev->dev, "Runtime PM not available\n");
+}
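amdgpu_device_supports_baco() now returns a small bitmask rather than a bool, which the function above decodes. A minimal sketch of that convention, assuming BACO_SUPPORT is bit 0 and MACO_SUPPORT is bit 1 as the checks above imply:

/* Sketch only: mirrors the decoding in amdgpu_device_detect_runtime_pm_mode(). */
static const char *baco_mode_name(int bamaco_support)
{
	if (bamaco_support & MACO_SUPPORT)
		return "bamaco";	/* value 3: BACO + MACO */
	if (bamaco_support & BACO_SUPPORT)
		return "baco";		/* value 1: BACO only */
	return "none";			/* value 0 */
}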
/**
* amdgpu_device_supports_smart_shift - Is the device dGPU with
* smart shift support
@@ -370,10 +515,16 @@ size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
if (write) {
memcpy_toio(addr, buf, count);
+ /* Make sure HDP write cache flush happens without any reordering
+ * after the system memory contents are sent over PCIe to the device
+ */
mb();
amdgpu_device_flush_hdp(adev, NULL);
} else {
amdgpu_device_invalidate_hdp(adev, NULL);
+ /* Make sure HDP read cache is invalidated before issuing a read
+ * to the PCIe device
+ */
mb();
memcpy_fromio(buf, addr, count);
}
@@ -464,7 +615,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
down_read_trylock(&adev->reset_domain->sem)) {
- ret = amdgpu_kiq_rreg(adev, reg);
+ ret = amdgpu_kiq_rreg(adev, reg, 0);
up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -481,8 +632,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
/*
* MMIO register read with bytes helper functions
* @offset:bytes offset from MMIO start
- *
-*/
+ */
/**
* amdgpu_mm_rreg8 - read a memory mapped IO register
@@ -502,12 +652,55 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
BUG();
}
+
+/**
+ * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
+ *
+ * @adev: amdgpu_device pointer
+ * @reg: dword aligned register offset
+ * @acc_flags: access flags which require special behavior
+ * @xcc_id: xcc accelerated compute core id
+ *
+ * Returns the 32 bit value from the offset specified.
+ */
+uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
+ uint32_t reg, uint32_t acc_flags,
+ uint32_t xcc_id)
+{
+ uint32_t ret, rlcg_flag;
+
+ if (amdgpu_device_skip_hw_access(adev))
+ return 0;
+
+ if ((reg * 4) < adev->rmmio_size) {
+ if (amdgpu_sriov_vf(adev) &&
+ !amdgpu_sriov_runtime(adev) &&
+ adev->gfx.rlc.rlcg_reg_access_supported &&
+ amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
+ GC_HWIP, false,
+ &rlcg_flag)) {
+ ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
+ } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+ amdgpu_sriov_runtime(adev) &&
+ down_read_trylock(&adev->reset_domain->sem)) {
+ ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
+ up_read(&adev->reset_domain->sem);
+ } else {
+ ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
+ }
+ } else {
+ ret = adev->pcie_rreg(adev, reg * 4);
+ }
+
+ return ret;
+}
+
/*
* MMIO register write with bytes helper functions
* @offset:bytes offset from MMIO start
* @value: the value want to be written to the register
- *
-*/
+ */
+
/**
* amdgpu_mm_wreg8 - read a memory mapped IO register
*
@@ -549,7 +742,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
down_read_trylock(&adev->reset_domain->sem)) {
- amdgpu_kiq_wreg(adev, reg, v);
+ amdgpu_kiq_wreg(adev, reg, v, 0);
up_read(&adev->reset_domain->sem);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
@@ -567,11 +760,13 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
* @adev: amdgpu_device pointer
* @reg: mmio/rlc register
* @v: value to write
+ * @xcc_id: xcc accelerated compute core id
*
* this function is invoked only for the debugfs register access
*/
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
- uint32_t reg, uint32_t v)
+ uint32_t reg, uint32_t v,
+ uint32_t xcc_id)
{
if (amdgpu_device_skip_hw_access(adev))
return;
@@ -580,7 +775,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) {
if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
- return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
+ return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
} else if ((reg * 4) >= adev->rmmio_size) {
adev->pcie_wreg(adev, reg * 4, v);
} else {
@@ -589,90 +784,43 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
}
/**
- * amdgpu_mm_rdoorbell - read a doorbell dword
+ * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
*
* @adev: amdgpu_device pointer
- * @index: doorbell index
- *
- * Returns the value in the doorbell aperture at the
- * requested doorbell index (CIK).
- */
-u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
-{
- if (amdgpu_device_skip_hw_access(adev))
- return 0;
-
- if (index < adev->doorbell.num_kernel_doorbells) {
- return readl(adev->doorbell.ptr + index);
- } else {
- DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
- return 0;
- }
-}
-
-/**
- * amdgpu_mm_wdoorbell - write a doorbell dword
- *
- * @adev: amdgpu_device pointer
- * @index: doorbell index
- * @v: value to write
- *
- * Writes @v to the doorbell aperture at the
- * requested doorbell index (CIK).
- */
-void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
-{
- if (amdgpu_device_skip_hw_access(adev))
- return;
-
- if (index < adev->doorbell.num_kernel_doorbells) {
- writel(v, adev->doorbell.ptr + index);
- } else {
- DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
- }
-}
-
-/**
- * amdgpu_mm_rdoorbell64 - read a doorbell Qword
- *
- * @adev: amdgpu_device pointer
- * @index: doorbell index
+ * @reg: dword aligned register offset
+ * @v: 32 bit value to write to the register
+ * @acc_flags: access flags which require special behavior
+ * @xcc_id: xcc accelerated compute core id
*
- * Returns the value in the doorbell aperture at the
- * requested doorbell index (VEGA10+).
+ * Writes the value specified to the offset specified.
*/
-u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
+void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
+ uint32_t reg, uint32_t v,
+ uint32_t acc_flags, uint32_t xcc_id)
{
- if (amdgpu_device_skip_hw_access(adev))
- return 0;
-
- if (index < adev->doorbell.num_kernel_doorbells) {
- return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
- } else {
- DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
- return 0;
- }
-}
+ uint32_t rlcg_flag;
-/**
- * amdgpu_mm_wdoorbell64 - write a doorbell Qword
- *
- * @adev: amdgpu_device pointer
- * @index: doorbell index
- * @v: value to write
- *
- * Writes @v to the doorbell aperture at the
- * requested doorbell index (VEGA10+).
- */
-void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
-{
if (amdgpu_device_skip_hw_access(adev))
return;
- if (index < adev->doorbell.num_kernel_doorbells) {
- atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
+ if ((reg * 4) < adev->rmmio_size) {
+ if (amdgpu_sriov_vf(adev) &&
+ !amdgpu_sriov_runtime(adev) &&
+ adev->gfx.rlc.rlcg_reg_access_supported &&
+ amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
+ GC_HWIP, true,
+ &rlcg_flag)) {
+ amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
+ } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
+ amdgpu_sriov_runtime(adev) &&
+ down_read_trylock(&adev->reset_domain->sem)) {
+ amdgpu_kiq_wreg(adev, reg, v, xcc_id);
+ up_read(&adev->reset_domain->sem);
+ } else {
+ writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
+ }
} else {
- DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
+ adev->pcie_wreg(adev, reg * 4, v);
}
}
@@ -716,12 +864,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
- pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
- pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
- if (adev->nbio.funcs->get_pcie_index_hi_offset)
- pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
- else
+ if (unlikely(!adev->nbio.funcs)) {
+ pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
+ pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
+ } else {
+ pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+ pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+ }
+
+ if (reg_addr >> 32) {
+ if (unlikely(!adev->nbio.funcs))
+ pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
+ else
+ pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+ } else {
pcie_index_hi = 0;
+ }
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
@@ -785,6 +943,56 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
return r;
}
+u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
+ u64 reg_addr)
+{
+ unsigned long flags, pcie_index, pcie_data;
+ unsigned long pcie_index_hi = 0;
+ void __iomem *pcie_index_offset;
+ void __iomem *pcie_index_hi_offset;
+ void __iomem *pcie_data_offset;
+ u64 r;
+
+ pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+ pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+ if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
+ pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+ if (pcie_index_hi != 0)
+ pcie_index_hi_offset = (void __iomem *)adev->rmmio +
+ pcie_index_hi * 4;
+
+ /* read low 32 bits */
+ writel(reg_addr, pcie_index_offset);
+ readl(pcie_index_offset);
+ if (pcie_index_hi != 0) {
+ writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+ r = readl(pcie_data_offset);
+ /* read high 32 bits */
+ writel(reg_addr + 4, pcie_index_offset);
+ readl(pcie_index_offset);
+ if (pcie_index_hi != 0) {
+ writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+ r |= ((u64)readl(pcie_data_offset) << 32);
+
+ /* clear the high bits */
+ if (pcie_index_hi != 0) {
+ writel(0, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+
+ return r;
+}
+
/**
* amdgpu_device_indirect_wreg - write an indirect register address
*
@@ -824,7 +1032,7 @@ void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
- if (adev->nbio.funcs->get_pcie_index_hi_offset)
+ if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
else
pcie_index_hi = 0;
@@ -889,6 +1097,55 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
+void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
+ u64 reg_addr, u64 reg_data)
+{
+ unsigned long flags, pcie_index, pcie_data;
+ unsigned long pcie_index_hi = 0;
+ void __iomem *pcie_index_offset;
+ void __iomem *pcie_index_hi_offset;
+ void __iomem *pcie_data_offset;
+
+ pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+ pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+ if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
+ pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+ pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+ pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
+ if (pcie_index_hi != 0)
+ pcie_index_hi_offset = (void __iomem *)adev->rmmio +
+ pcie_index_hi * 4;
+
+ /* write low 32 bits */
+ writel(reg_addr, pcie_index_offset);
+ readl(pcie_index_offset);
+ if (pcie_index_hi != 0) {
+ writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+ writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
+ readl(pcie_data_offset);
+ /* write high 32 bits */
+ writel(reg_addr + 4, pcie_index_offset);
+ readl(pcie_index_offset);
+ if (pcie_index_hi != 0) {
+ writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+ writel((u32)(reg_data >> 32), pcie_data_offset);
+ readl(pcie_data_offset);
+
+ /* clear the high bits */
+ if (pcie_index_hi != 0) {
+ writel(0, pcie_index_hi_offset);
+ readl(pcie_index_hi_offset);
+ }
+
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+}
+
/**
* amdgpu_device_get_rev_id - query device rev_id
*
@@ -966,6 +1223,13 @@ static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
return 0;
}
+static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
+{
+ DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
+ BUG();
+ return 0;
+}
+
/**
* amdgpu_invalid_wreg64 - dummy reg write function
*
@@ -983,6 +1247,13 @@ static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint
BUG();
}
+static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
+{
+ DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
+ reg, v);
+ BUG();
+}
+
/**
* amdgpu_block_invalid_rreg - dummy reg read function
*
@@ -1032,13 +1303,20 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
*/
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
+ int ret;
+
amdgpu_asic_pre_asic_init(adev);
- if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
- adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
- return amdgpu_atomfirmware_asic_init(adev, true);
- else
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
+ amdgpu_psp_wait_for_bootloader(adev);
+ ret = amdgpu_atomfirmware_asic_init(adev, true);
+ return ret;
+ } else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
+ }
+
+ return 0;
}
/**
@@ -1078,7 +1356,7 @@ static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
* @registers: pointer to the register array
* @array_size: size of the register array
*
- * Programs an array or registers with and and or masks.
+ * Programs an array or registers with and or masks.
* This is a helper for setting golden registers.
*/
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
@@ -1136,83 +1414,6 @@ int amdgpu_device_pci_reset(struct amdgpu_device *adev)
}
/*
- * GPU doorbell aperture helpers function.
- */
-/**
- * amdgpu_device_doorbell_init - Init doorbell driver information.
- *
- * @adev: amdgpu_device pointer
- *
- * Init doorbell driver information (CIK)
- * Returns 0 on success, error on failure.
- */
-static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
-{
-
- /* No doorbell on SI hardware generation */
- if (adev->asic_type < CHIP_BONAIRE) {
- adev->doorbell.base = 0;
- adev->doorbell.size = 0;
- adev->doorbell.num_kernel_doorbells = 0;
- adev->doorbell.ptr = NULL;
- return 0;
- }
-
- if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
- return -EINVAL;
-
- amdgpu_asic_init_doorbell_index(adev);
-
- /* doorbell bar mapping */
- adev->doorbell.base = pci_resource_start(adev->pdev, 2);
- adev->doorbell.size = pci_resource_len(adev->pdev, 2);
-
- if (adev->enable_mes) {
- adev->doorbell.num_kernel_doorbells =
- adev->doorbell.size / sizeof(u32);
- } else {
- adev->doorbell.num_kernel_doorbells =
- min_t(u32, adev->doorbell.size / sizeof(u32),
- adev->doorbell_index.max_assignment+1);
- if (adev->doorbell.num_kernel_doorbells == 0)
- return -EINVAL;
-
- /* For Vega, reserve and map two pages on doorbell BAR since SDMA
- * paging queue doorbell use the second page. The
- * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
- * doorbells are in the first page. So with paging queue enabled,
- * the max num_kernel_doorbells should + 1 page (0x400 in dword)
- */
- if (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(4, 0, 0) &&
- adev->ip_versions[SDMA0_HWIP][0] < IP_VERSION(4, 2, 0))
- adev->doorbell.num_kernel_doorbells += 0x400;
- }
-
- adev->doorbell.ptr = ioremap(adev->doorbell.base,
- adev->doorbell.num_kernel_doorbells *
- sizeof(u32));
- if (adev->doorbell.ptr == NULL)
- return -ENOMEM;
-
- return 0;
-}
-
-/**
- * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
- *
- * @adev: amdgpu_device pointer
- *
- * Tear down doorbell driver information (CIK)
- */
-static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
-{
- iounmap(adev->doorbell.ptr);
- adev->doorbell.ptr = NULL;
-}
-
-
-
-/*
* amdgpu_device_wb_*()
* Writeback is the method by which the GPU updates special pages in memory
* with the status of certain GPU events (fences, ring pointers,etc.).
@@ -1281,13 +1482,17 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
*/
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
- unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
+ unsigned long flags, offset;
+ spin_lock_irqsave(&adev->wb.lock, flags);
+ offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
if (offset < adev->wb.num_wb) {
__set_bit(offset, adev->wb.used);
+ spin_unlock_irqrestore(&adev->wb.lock, flags);
*wb = offset << 3; /* convert to dw offset */
return 0;
} else {
+ spin_unlock_irqrestore(&adev->wb.lock, flags);
return -EINVAL;
}
}
@@ -1302,9 +1507,13 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
*/
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
+ unsigned long flags;
+
wb >>= 3;
+ spin_lock_irqsave(&adev->wb.lock, flags);
if (wb < adev->wb.num_wb)
__clear_bit(wb, adev->wb.used);
+ spin_unlock_irqrestore(&adev->wb.lock, flags);
}
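Both writeback helpers now serialize on adev->wb.lock, and the handle they hand out is a dword offset into the writeback page (the bit index shifted by 3, i.e. eight dwords per slot). A minimal usage sketch, assuming the usual adev->wb.wb / adev->wb.gpu_addr fields:

static int example_wb_usage(struct amdgpu_device *adev)
{
	volatile u32 *cpu_ptr;
	u64 gpu_addr;
	u32 wb;
	int r;

	r = amdgpu_device_wb_get(adev, &wb);	/* takes adev->wb.lock internally */
	if (r)
		return r;

	/* CPU view and GPU address of the same writeback dword */
	cpu_ptr = &adev->wb.wb[wb];
	gpu_addr = adev->wb.gpu_addr + wb * 4;
	(void)cpu_ptr;
	(void)gpu_addr;

	amdgpu_device_wb_free(adev, wb);	/* also serialized on adev->wb.lock */
	return 0;
}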
/**
@@ -1321,14 +1530,21 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
struct pci_bus *root;
struct resource *res;
- unsigned i;
+ unsigned int i;
u16 cmd;
int r;
+ if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
+ return 0;
+
/* Bypass for VF */
if (amdgpu_sriov_vf(adev))
return 0;
+ /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
+ if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
+ DRM_WARN("System can't access extended configuration space, please check!!\n");
+
/* skip if the bios has already enabled large BAR */
if (adev->gmc.real_vram_size &&
(pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
@@ -1359,7 +1575,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
cmd & ~PCI_COMMAND_MEMORY);
/* Free the VRAM and doorbell BAR, we most likely need to move both. */
- amdgpu_device_doorbell_fini(adev);
+ amdgpu_doorbell_fini(adev);
if (adev->asic_type >= CHIP_BONAIRE)
pci_release_resource(adev->pdev, 2);
@@ -1376,7 +1592,7 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
/* When the doorbell or fb BAR isn't available we have no chance of
* using the device.
*/
- r = amdgpu_device_doorbell_init(adev);
+ r = amdgpu_doorbell_init(adev);
if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
return -ENODEV;
@@ -1387,9 +1603,8 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
{
- if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
+ if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
return false;
- }
return true;
}
@@ -1425,12 +1640,14 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
if (adev->asic_type == CHIP_FIJI) {
int err;
uint32_t fw_ver;
+
err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
/* force vPost if error occured */
if (err)
return true;
fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
+ release_firmware(adev->pm.fw);
if (fw_ver < 0x00160e00)
return true;
}
@@ -1459,44 +1676,53 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
/*
- * On APUs with >= 64GB white flickering has been observed w/ SG enabled.
- * Disable S/G on such systems until we have a proper fix.
- * https://gitlab.freedesktop.org/drm/amd/-/issues/2354
- * https://gitlab.freedesktop.org/drm/amd/-/issues/2735
+ * Check whether seamless boot is supported.
+ *
+ * So far we only support seamless boot on DCE 3.0 or later.
+ * If users report that it works on older ASICs as well, we may
+ * loosen this.
*/
-bool amdgpu_sg_display_supported(struct amdgpu_device *adev)
+bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
{
- switch (amdgpu_sg_display) {
+ switch (amdgpu_seamless) {
case -1:
break;
- case 0:
- return false;
case 1:
return true;
+ case 0:
+ return false;
default:
+ DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
+ amdgpu_seamless);
return false;
}
- if ((totalram_pages() << (PAGE_SHIFT - 10)) +
- (adev->gmc.real_vram_size / 1024) >= 64000000) {
- DRM_WARN("Disabling S/G due to >=64GB RAM\n");
+
+ if (!(adev->flags & AMD_IS_APU))
return false;
- }
- return true;
+
+ if (adev->mman.keep_stolen_vga_memory)
+ return false;
+
+ return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
}
/*
- * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
- * speed switching. Until we have confirmation from Intel that a specific host
- * supports it, it's safer that we keep it disabled for all.
+ * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
+ * don't support dynamic speed switching. Until we have confirmation from Intel
+ * that a specific host supports it, it's safer that we keep it disabled for all.
*
* https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
* https://gitlab.freedesktop.org/drm/amd/-/issues/2663
*/
-bool amdgpu_device_pcie_dynamic_switching_supported(void)
+static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
{
#if IS_ENABLED(CONFIG_X86)
struct cpuinfo_x86 *c = &cpu_data(0);
+ /* eGPU change speeds based on USB4 fabric conditions */
+ if (dev_is_removable(adev->dev))
+ return true;
+
if (c->x86_vendor == X86_VENDOR_INTEL)
return false;
#endif
@@ -1525,20 +1751,13 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
default:
return false;
}
+ if (adev->flags & AMD_IS_APU)
+ return false;
+ if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
+ return false;
return pcie_aspm_enabled(adev->pdev);
}
-bool amdgpu_device_aspm_support_quirk(void)
-{
-#if IS_ENABLED(CONFIG_X86)
- struct cpuinfo_x86 *c = &cpu_data(0);
-
- return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
-#else
- return true;
-#endif
-}
-
/* if we get transitioned to only one device, take VGA back */
/**
* amdgpu_device_vga_set_decode - enable/disable vga decode
@@ -1553,6 +1772,7 @@ static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
bool state)
{
struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
+
amdgpu_asic_set_vga_state(adev, state);
if (state)
return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
@@ -1575,7 +1795,8 @@ static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
/* defines number of bits in page table versus page directory,
* a page is 4KB so we have 12 bits offset, minimum 9 bits in the
- * page table and the remaining bits are in the page directory */
+ * page table and the remaining bits are in the page directory
+ */
if (amdgpu_vm_block_size == -1)
return;
@@ -1785,6 +2006,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
} else {
pr_info("switched off\n");
dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+ amdgpu_device_prepare(dev);
amdgpu_device_suspend(dev, true);
amdgpu_device_cache_pci_state(pdev);
/* Shut down the device */
@@ -1807,7 +2029,7 @@ static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
- /*
+ /*
* FIXME: open_count is protected by drm_global_mutex but that would lead to
* locking inversion with the driver load path. And the access here is
* completely racy anyway. So don't bother with locking for now.
@@ -2133,15 +2355,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
adev->firmware.gpu_info_fw = NULL;
- if (adev->mman.discovery_bin) {
- /*
- * FIXME: The bounding box is still needed by Navi12, so
- * temporarily read it from gpu_info firmware. Should be dropped
- * when DAL no longer needs it.
- */
- if (adev->asic_type != CHIP_NAVI12)
- return 0;
- }
+ if (adev->mman.discovery_bin)
+ return 0;
switch (adev->asic_type) {
default:
@@ -2256,7 +2471,6 @@ out:
*/
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
- struct drm_device *dev = adev_to_drm(adev);
struct pci_dev *parent;
int i, r;
bool total;
@@ -2327,11 +2541,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
(amdgpu_is_atpx_hybrid() ||
amdgpu_has_atpx_dgpu_power_cntl()) &&
((adev->flags & AMD_IS_APU) == 0) &&
- !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
+ !dev_is_removable(&adev->pdev->dev))
adev->flags |= AMD_IS_PX;
if (!(adev->flags & AMD_IS_APU)) {
- parent = pci_upstream_bridge(adev->pdev);
+ parent = pcie_find_root_port(adev->pdev);
adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
}
@@ -2341,6 +2555,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
+ if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
+ adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
total = true;
for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -2517,7 +2733,8 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
break;
}
- r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+ r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
+ DRM_SCHED_PRIORITY_COUNT,
ring->num_hw_submission, 0,
timeout, adev->reset_domain->wq,
ring->sched_score, ring->name,
@@ -2527,6 +2744,18 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
ring->name);
return r;
}
+ r = amdgpu_uvd_entity_init(adev, ring);
+ if (r) {
+ DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
+ ring->name);
+ return r;
+ }
+ r = amdgpu_vce_entity_init(adev, ring);
+ if (r) {
+ DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
+ ring->name);
+ return r;
+ }
}
amdgpu_xcp_update_partition_sched_list(adev);
@@ -2607,6 +2836,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
goto init_failed;
}
}
+
+ r = amdgpu_seq64_init(adev);
+ if (r) {
+ DRM_ERROR("allocate seq64 failed %d\n", r);
+ goto init_failed;
+ }
}
}
@@ -2687,6 +2922,9 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r)
goto init_failed;
+ if (adev->mman.buffer_funcs_ring->sched.ready)
+ amdgpu_ttm_set_buffer_funcs_status(adev, true);
+
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset) {
kgd2kfd_init_zone_device(adev);
@@ -2969,7 +3207,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
int i, r;
- if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
return;
for (i = 0; i < adev->num_ip_blocks; i++) {
@@ -3066,6 +3304,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_device_wb_fini(adev);
amdgpu_device_mem_scratch_fini(adev);
amdgpu_ib_pool_fini(adev);
+ amdgpu_seq64_fini(adev);
}
r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
@@ -3222,8 +3461,10 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
if (adev->in_s0ix &&
- (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
- (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
+ (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
+ IP_VERSION(5, 0, 0)) &&
+ (adev->ip_blocks[i].version->type ==
+ AMD_IP_BLOCK_TYPE_SDMA))
continue;
/* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
@@ -3282,6 +3523,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
amdgpu_virt_request_full_gpu(adev, false);
}
+ amdgpu_ttm_set_buffer_funcs_status(adev, false);
+
r = amdgpu_device_ip_suspend_phase1(adev);
if (r)
return r;
@@ -3452,7 +3695,7 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
*
* Main resume function for hardware IPs. The hardware IPs
* are split into two resume functions because they are
- * are also used in in recovering from a GPU reset and some additional
+ * also used in recovering from a GPU reset and some additional
* steps need to be take between them. In this case (S3/S4) they are
* run sequentially.
* Returns 0 on success, negative error code on failure.
@@ -3461,12 +3704,6 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
int r;
- if (!adev->in_s0ix) {
- r = amdgpu_amdkfd_resume_iommu(adev);
- if (r)
- return r;
- }
-
r = amdgpu_device_ip_resume_phase1(adev);
if (r)
return r;
@@ -3477,6 +3714,9 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
r = amdgpu_device_ip_resume_phase2(adev);
+ if (adev->mman.buffer_funcs_ring->sched.ready)
+ amdgpu_ttm_set_buffer_funcs_status(adev, true);
+
return r;
}
@@ -3554,8 +3794,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
#else
default:
if (amdgpu_dc > 0)
- DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
- "but isn't supported by ASIC, ignoring\n");
+ DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
return false;
#endif
}
@@ -3607,9 +3846,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (adev->asic_reset_res)
goto fail;
- if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
- adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
- adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
+ amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
} else {
task_barrier_full(&hive->tb);
@@ -3711,9 +3948,6 @@ static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
}
static const struct attribute *amdgpu_dev_attributes[] = {
- &dev_attr_product_name.attr,
- &dev_attr_product_number.attr,
- &dev_attr_serial_number.attr,
&dev_attr_pcie_replay_count.attr,
NULL
};
@@ -3724,10 +3958,6 @@ static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
adev->gfx.mcbp = true;
else if (amdgpu_mcbp == 0)
adev->gfx.mcbp = false;
- else if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
- (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
- adev->gfx.num_gfx_rings)
- adev->gfx.mcbp = true;
if (amdgpu_sriov_vf(adev))
adev->gfx.mcbp = true;
@@ -3790,6 +4020,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->pciep_wreg = &amdgpu_invalid_wreg;
adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
+ adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
+ adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
adev->didt_rreg = &amdgpu_invalid_rreg;
@@ -3804,7 +4036,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
/* mutex initialization are all done here so we
- * can recall function without having locking issues */
+ * can recall function without having locking issues
+ */
mutex_init(&adev->firmware.mutex);
mutex_init(&adev->pm.mutex);
mutex_init(&adev->gfx.gpu_clock_mutex);
@@ -3836,6 +4069,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
spin_lock_init(&adev->se_cac_idx_lock);
spin_lock_init(&adev->audio_endpt_idx_lock);
spin_lock_init(&adev->mm_stats.lock);
+ spin_lock_init(&adev->wb.lock);
INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
@@ -3844,6 +4078,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(&adev->ras_list);
+ INIT_LIST_HEAD(&adev->pm.od_kobj_list);
+
INIT_DELAYED_WORK(&adev->delayed_init_work,
amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -3881,11 +4117,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
- if (adev->rmmio == NULL) {
+ if (!adev->rmmio)
return -ENOMEM;
- }
+
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
- DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
+ DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
/*
* Reset domain needs to be present early, before XGMI hive discovered
@@ -3907,13 +4143,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
}
+ amdgpu_device_set_mcbp(adev);
+
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
return r;
- amdgpu_device_set_mcbp(adev);
-
/* Get rid of things like offb */
r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
if (r)
@@ -3922,6 +4158,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* Enable TMZ based on IP_VERSION */
amdgpu_gmc_tmz_set(adev);
+ if (amdgpu_sriov_vf(adev) &&
+ amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
+ /* VF MMIO access (except mailbox range) from CPU
+ * will be blocked during sriov runtime
+ */
+ adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
+
amdgpu_gmc_noretry_set(adev);
/* Need to get xgmi info early to decide the reset behavior*/
if (adev->gmc.xgmi.supported) {
@@ -3940,7 +4183,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
* internal path natively support atomics, set have_atomics_support to true.
*/
} else if ((adev->flags & AMD_IS_APU) &&
- (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
+ (amdgpu_ip_version(adev, GC_HWIP, 0) >
+ IP_VERSION(9, 0, 0))) {
adev->have_atomics_support = true;
} else {
adev->have_atomics_support =
@@ -3953,7 +4197,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
dev_info(adev->dev, "PCIE atomic ops is not supported\n");
/* doorbell bar mapping and doorbell index init*/
- amdgpu_device_doorbell_init(adev);
+ amdgpu_doorbell_init(adev);
if (amdgpu_emu_mode == 1) {
/* post the asic on emulation mode */
@@ -3987,18 +4231,22 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->ip_blocks[i].status.hw = true;
}
}
+ } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
+ !amdgpu_device_has_display_hardware(adev)) {
+ r = psp_gpu_reset(adev);
} else {
- tmp = amdgpu_reset_method;
- /* It should do a default reset when loading or reloading the driver,
- * regardless of the module parameter reset_method.
- */
- amdgpu_reset_method = AMD_RESET_METHOD_NONE;
- r = amdgpu_asic_reset(adev);
- amdgpu_reset_method = tmp;
- if (r) {
- dev_err(adev->dev, "asic reset on init failed\n");
- goto failed;
- }
+ tmp = amdgpu_reset_method;
+ /* It should do a default reset when loading or reloading the driver,
+ * regardless of the module parameter reset_method.
+ */
+ amdgpu_reset_method = AMD_RESET_METHOD_NONE;
+ r = amdgpu_asic_reset(adev);
+ amdgpu_reset_method = tmp;
+ }
+
+ if (r) {
+ dev_err(adev->dev, "asic reset on init failed\n");
+ goto failed;
}
}
@@ -4080,30 +4328,6 @@ fence_driver_init:
/* Get a log2 for easy divisions. */
adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
- r = amdgpu_atombios_sysfs_init(adev);
- if (r)
- drm_err(&adev->ddev,
- "registering atombios sysfs failed (%d).\n", r);
-
- r = amdgpu_pm_sysfs_init(adev);
- if (r)
- DRM_ERROR("registering pm sysfs failed (%d).\n", r);
-
- r = amdgpu_ucode_sysfs_init(adev);
- if (r) {
- adev->ucode_sysfs_en = false;
- DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
- } else
- adev->ucode_sysfs_en = true;
-
- r = amdgpu_psp_sysfs_init(adev);
- if (r) {
- adev->psp_sysfs_en = false;
- if (!amdgpu_sriov_vf(adev))
- DRM_ERROR("Creating psp sysfs failed\n");
- } else
- adev->psp_sysfs_en = true;
-
/*
* Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
* Otherwise the mgpu fan boost feature will be skipped due to the
@@ -4132,10 +4356,39 @@ fence_driver_init:
flush_delayed_work(&adev->delayed_init_work);
}
+ /*
+ * Place the sysfs registration after `late_init`, as some of the
+ * operations performed in `late_init` might affect the creation of
+ * the sysfs interfaces.
+ */
+ r = amdgpu_atombios_sysfs_init(adev);
+ if (r)
+ drm_err(&adev->ddev,
+ "registering atombios sysfs failed (%d).\n", r);
+
+ r = amdgpu_pm_sysfs_init(adev);
+ if (r)
+ DRM_ERROR("registering pm sysfs failed (%d).\n", r);
+
+ r = amdgpu_ucode_sysfs_init(adev);
+ if (r) {
+ adev->ucode_sysfs_en = false;
+ DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
+ } else
+ adev->ucode_sysfs_en = true;
+
r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
if (r)
dev_err(adev->dev, "Could not create amdgpu device attr\n");
+ r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
+ if (r)
+ dev_err(adev->dev,
+ "Could not create amdgpu board attributes\n");
+
+ amdgpu_fru_sysfs_init(adev);
+ amdgpu_reg_state_sysfs_init(adev);
+
if (IS_ENABLED(CONFIG_PERF_EVENTS))
r = amdgpu_pmu_init(adev);
if (r)
@@ -4147,13 +4400,14 @@ fence_driver_init:
/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
/* this will fail for cards that aren't VGA class devices, just
- * ignore it */
+ * ignore it
+ */
if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
px = amdgpu_device_supports_px(ddev);
- if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
+ if (px || (!dev_is_removable(&adev->pdev->dev) &&
apple_gmux_detect(NULL, NULL)))
vga_switcheroo_register_client(adev->pdev,
&amdgpu_switcheroo_ops, px);
@@ -4199,7 +4453,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
/* Unmap all mapped bars - Doorbell, registers and VRAM */
- amdgpu_device_doorbell_fini(adev);
+ amdgpu_doorbell_fini(adev);
iounmap(adev->rmmio);
adev->rmmio = NULL;
@@ -4230,7 +4484,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
/* make sure IB test finished before entering exclusive mode
* to avoid preemption on IB test
- * */
+ */
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_request_full_gpu(adev, false);
amdgpu_virt_fini_data_exchange(adev);
@@ -4253,13 +4507,16 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
- if (adev->psp_sysfs_en)
- amdgpu_psp_sysfs_fini(adev);
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
+ amdgpu_fru_sysfs_fini(adev);
+
+ amdgpu_reg_state_sysfs_fini(adev);
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
+ amdgpu_ttm_set_buffer_funcs_status(adev, false);
+
amdgpu_device_ip_fini_early(adev);
amdgpu_irq_fini_hw(adev);
@@ -4297,9 +4554,12 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
kfree(adev->bios);
adev->bios = NULL;
+ kfree(adev->fru_info);
+ adev->fru_info = NULL;
+
px = amdgpu_device_supports_px(adev_to_drm(adev));
- if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
+ if (px || (!dev_is_removable(&adev->pdev->dev) &&
apple_gmux_detect(NULL, NULL)))
vga_switcheroo_unregister_client(adev->pdev);
@@ -4313,7 +4573,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
iounmap(adev->rmmio);
adev->rmmio = NULL;
- amdgpu_device_doorbell_fini(adev);
+ amdgpu_doorbell_fini(adev);
drm_dev_exit(idx);
}
@@ -4356,6 +4616,50 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
* Suspend & resume.
*/
/**
+ * amdgpu_device_prepare - prepare for device suspend
+ *
+ * @dev: drm dev pointer
+ *
+ * Prepare to put the hw in the suspend state (all asics).
+ * Returns 0 for success or an error on failure.
+ * Called at driver suspend.
+ */
+int amdgpu_device_prepare(struct drm_device *dev)
+{
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ int i, r;
+
+ amdgpu_choose_low_power_state(adev);
+
+ if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
+ return 0;
+
+ /* Evict the majority of BOs before starting suspend sequence */
+ r = amdgpu_device_evict_resources(adev);
+ if (r)
+ goto unprepare;
+
+ flush_delayed_work(&adev->gfx.gfx_off_delay_work);
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (!adev->ip_blocks[i].status.valid)
+ continue;
+ if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
+ continue;
+ r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
+ if (r)
+ goto unprepare;
+ }
+
+ return 0;
+
+unprepare:
+ adev->in_s0ix = adev->in_s3 = false;
+
+ return r;
+}
+
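amdgpu_device_prepare() is intended to run before amdgpu_device_suspend(), as the vga_switcheroo path earlier in this patch now does. A minimal sketch of that pairing; the actual PM-callback wiring lives in amdgpu_drv.c and is not shown here:

static int example_suspend_path(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_prepare(dev);
	if (r)
		return r;

	return amdgpu_device_suspend(dev, true /* fbcon */);
}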
+/**
* amdgpu_device_suspend - initiate device suspend
*
* @dev: drm dev pointer
@@ -4375,11 +4679,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
adev->in_suspend = true;
- /* Evict the majority of BOs before grabbing the full access */
- r = amdgpu_device_evict_resources(adev);
- if (r)
- return r;
-
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
@@ -4394,7 +4693,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
cancel_delayed_work_sync(&adev->delayed_init_work);
- flush_delayed_work(&adev->gfx.gfx_off_delay_work);
amdgpu_ras_suspend(adev);
@@ -4407,6 +4705,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
if (r)
return r;
+ amdgpu_ttm_set_buffer_funcs_status(adev, false);
+
amdgpu_fence_driver_hw_fini(adev);
amdgpu_device_ip_suspend_phase2(adev);
@@ -4414,6 +4714,10 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
if (amdgpu_sriov_vf(adev))
amdgpu_virt_release_full_gpu(adev, false);
+ r = amdgpu_dpm_notify_rlc_state(adev, false);
+ if (r)
+ return r;
+
return 0;
}
@@ -4459,19 +4763,18 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
}
amdgpu_fence_driver_hw_init(adev);
- r = amdgpu_device_ip_late_init(adev);
- if (r)
- goto exit;
-
- queue_delayed_work(system_wq, &adev->delayed_init_work,
- msecs_to_jiffies(AMDGPU_RESUME_MS));
-
if (!adev->in_s0ix) {
r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
if (r)
goto exit;
}
+ r = amdgpu_device_ip_late_init(adev);
+ if (r)
+ goto exit;
+
+ queue_delayed_work(system_wq, &adev->delayed_init_work,
+ msecs_to_jiffies(AMDGPU_RESUME_MS));
exit:
if (amdgpu_sriov_vf(adev)) {
amdgpu_virt_init_data_exchange(adev);
@@ -4767,12 +5070,19 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
amdgpu_amdkfd_pre_reset(adev);
+ amdgpu_device_stop_pending_resets(adev);
+
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
r = amdgpu_virt_reset_gpu(adev);
if (r)
return r;
+ amdgpu_ras_set_fed(adev, false);
+ amdgpu_irq_gpu_reset_resume_helper(adev);
+
+ /* some SW cleanup the VF needs to do before recovery */
+ amdgpu_virt_post_reset(adev);
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
@@ -4799,7 +5109,6 @@ retry:
amdgpu_put_xgmi_hive(hive);
if (!r) {
- amdgpu_irq_gpu_reset_resume_helper(adev);
r = amdgpu_ib_ring_tests(adev);
amdgpu_amdkfd_post_reset(adev);
@@ -4838,7 +5147,7 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
spin_lock(&ring->sched.job_list_lock);
@@ -4925,9 +5234,12 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
}
if (ret)
- dev_err(adev->dev, "GPU mode1 reset failed\n");
+ goto mode1_reset_failed;
amdgpu_device_load_pci_state(adev->pdev);
+ ret = amdgpu_psp_wait_for_bootloader(adev);
+ if (ret)
+ goto mode1_reset_failed;
/* wait for asic to come out of reset */
for (i = 0; i < adev->usec_timeout; i++) {
@@ -4938,7 +5250,17 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
udelay(1);
}
+ if (i >= adev->usec_timeout) {
+ ret = -ETIMEDOUT;
+ goto mode1_reset_failed;
+ }
+
amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+
+ return 0;
+
+mode1_reset_failed:
+ dev_err(adev->dev, "GPU mode1 reset failed\n");
return ret;
}
@@ -4964,11 +5286,12 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
- /*clear job fence from fence drv to avoid force_completion
- *leave NULL and vm flush fence in fence drv */
+ /* Clear job fence from fence drv to avoid force_completion
+ * leave NULL and vm flush fence in fence drv
+ */
amdgpu_fence_driver_clear_job_fences(ring);
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
@@ -4982,7 +5305,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
- if (r == -ENOSYS)
+ if (r == -EOPNOTSUPP)
r = 0;
else
return r;
@@ -5022,67 +5345,16 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
lockdep_assert_held(&adev->reset_domain->sem);
- for (i = 0; i < adev->num_regs; i++) {
- adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
- trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
- adev->reset_dump_reg_value[i]);
- }
-
- return 0;
-}
-
-#ifdef CONFIG_DEV_COREDUMP
-static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
- size_t count, void *data, size_t datalen)
-{
- struct drm_printer p;
- struct amdgpu_device *adev = data;
- struct drm_print_iterator iter;
- int i;
-
- iter.data = buffer;
- iter.offset = 0;
- iter.start = offset;
- iter.remain = count;
-
- p = drm_coredump_printer(&iter);
-
- drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
- drm_printf(&p, "kernel: " UTS_RELEASE "\n");
- drm_printf(&p, "module: " KBUILD_MODNAME "\n");
- drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
- if (adev->reset_task_info.pid)
- drm_printf(&p, "process_name: %s PID: %d\n",
- adev->reset_task_info.process_name,
- adev->reset_task_info.pid);
+ for (i = 0; i < adev->reset_info.num_regs; i++) {
+ adev->reset_info.reset_dump_reg_value[i] =
+ RREG32(adev->reset_info.reset_dump_reg_list[i]);
- if (adev->reset_vram_lost)
- drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (adev->num_regs) {
- drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
-
- for (i = 0; i < adev->num_regs; i++)
- drm_printf(&p, "0x%08x: 0x%08x\n",
- adev->reset_dump_reg_list[i],
- adev->reset_dump_reg_value[i]);
+ trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
+ adev->reset_info.reset_dump_reg_value[i]);
}
- return count - iter.remain;
-}
-
-static void amdgpu_devcoredump_free(void *data)
-{
-}
-
-static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
-{
- struct drm_device *dev = adev_to_drm(adev);
-
- ktime_get_ts64(&adev->reset_time);
- dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
- amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+ return 0;
}
-#endif
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
@@ -5090,17 +5362,26 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
- bool gpu_reset_for_dev_remove = 0;
+ uint32_t i;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
- amdgpu_reset_reg_dumps(tmp_adev);
+
+ if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
+ amdgpu_reset_reg_dumps(tmp_adev);
+
+ /* Trigger ip dump before we reset the asic */
+ for (i = 0; i < tmp_adev->num_ip_blocks; i++)
+ if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
+ tmp_adev->ip_blocks[i].version->funcs
+ ->dump_ip_state((void *)tmp_adev);
+ }
reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
- if (r == -ENOSYS)
+ if (r == -EOPNOTSUPP)
r = 0;
else
return r;
@@ -5110,10 +5391,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
- gpu_reset_for_dev_remove =
- test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
- test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
-
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
@@ -5131,7 +5408,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
if (r) {
dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
r, adev_to_drm(tmp_adev)->unique);
- break;
+ goto out;
}
}
@@ -5150,52 +5427,31 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
- tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
- tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
+ amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
}
amdgpu_ras_intr_cleared();
}
- /* Since the mode1 reset affects base ip blocks, the
- * phase1 ip blocks need to be resumed. Otherwise there
- * will be a BIOS signature error and the psp bootloader
- * can't load kdb on the next amdgpu install.
- */
- if (gpu_reset_for_dev_remove) {
- list_for_each_entry(tmp_adev, device_list_handle, reset_list)
- amdgpu_device_ip_resume_phase1(tmp_adev);
-
- goto end;
- }
-
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
+ amdgpu_ras_set_fed(tmp_adev, false);
r = amdgpu_device_asic_init(tmp_adev);
if (r) {
dev_warn(tmp_adev->dev, "asic atom init failed!");
} else {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
- r = amdgpu_amdkfd_resume_iommu(tmp_adev);
- if (r)
- goto out;
r = amdgpu_device_ip_resume_phase1(tmp_adev);
if (r)
goto out;
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
-#ifdef CONFIG_DEV_COREDUMP
- tmp_adev->reset_vram_lost = vram_lost;
- memset(&tmp_adev->reset_task_info, 0,
- sizeof(tmp_adev->reset_task_info));
- if (reset_context->job && reset_context->job->vm)
- tmp_adev->reset_task_info =
- reset_context->job->vm->task_info;
- amdgpu_reset_capture_coredumpm(tmp_adev);
-#endif
+
+ if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
+ amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
@@ -5205,10 +5461,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
if (r)
return r;
+ r = amdgpu_xcp_restore_partition_mode(
+ tmp_adev->xcp_mgr);
+ if (r)
+ goto out;
+
r = amdgpu_device_ip_resume_phase2(tmp_adev);
if (r)
goto out;
+ if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
+ amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
+
if (vram_lost)
amdgpu_device_fill_reset_magic(tmp_adev);
@@ -5384,6 +5648,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
}
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+ struct amdgpu_device *tmp_adev;
+ int ret = 0;
+ u32 status;
+
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+ if (PCI_POSSIBLE_ERROR(status)) {
+ dev_err(tmp_adev->dev, "device lost from bus!");
+ ret = -ENODEV;
+ }
+ }
+
+ return ret;
+}
+
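
Editor's note: the new amdgpu_device_health_check() above boils down to one idea: a PCI config read that comes back as all ones means the device has dropped off the bus. A hedged sketch of that probe in isolation; my_device_present() is an illustrative helper, not part of the driver.

#include <linux/pci.h>

static bool my_device_present(struct pci_dev *pdev)
{
	u32 cmd;

	/* reads from an absent or hung PCIe device return ~0 */
	pci_read_config_dword(pdev, PCI_COMMAND, &cmd);

	/* PCI_POSSIBLE_ERROR() is exactly that "all ones" check */
	return !PCI_POSSIBLE_ERROR(cmd);
}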
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -5407,11 +5688,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
- bool gpu_reset_for_dev_remove = false;
-
- gpu_reset_for_dev_remove =
- test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
- test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* Special case: RAS triggered and full reset isn't supported
@@ -5422,7 +5698,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* Flush RAM to disk so that after reboot
* the user can read log and see why the system rebooted.
*/
- if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
+ if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
+ amdgpu_ras_get_context(adev)->reboot) {
DRM_WARN("Emergency reboot.");
ksys_sync_helper();
@@ -5448,7 +5725,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
- if (gpu_reset_for_dev_remove && adev->shutdown)
+ if (adev->shutdown)
tmp_adev->shutdown = true;
}
if (!list_is_first(&adev->reset_list, &device_list))
@@ -5459,6 +5736,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(device_list_handle);
+ if (r)
+ goto end_reset;
+ }
+
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
@@ -5505,7 +5788,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
@@ -5533,10 +5816,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- if (gpu_reset_for_dev_remove) {
- /* Workaroud for ASICs need to disable SMC first */
- amdgpu_device_smu_fini_early(tmp_adev);
- }
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
@@ -5545,11 +5824,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
tmp_adev->asic_reset_res = r;
}
- /*
- * Drop all pending non scheduler resets. Scheduler resets
- * were already dropped during drm_sched_stop
- */
- amdgpu_device_stop_pending_resets(tmp_adev);
+ if (!amdgpu_sriov_vf(tmp_adev))
+ /*
+ * Drop all pending non scheduler resets. Scheduler resets
+ * were already dropped during drm_sched_stop
+ */
+ amdgpu_device_stop_pending_resets(tmp_adev);
}
/* Actual ASIC resets if needed.*/
@@ -5560,16 +5840,15 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
adev->asic_reset_res = r;
/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
- if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
- adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
+ if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
+ IP_VERSION(9, 4, 2) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
if (r && r == -EAGAIN)
goto retry;
-
- if (!r && gpu_reset_for_dev_remove)
- goto recover_end;
}
skip_hw_reset:
@@ -5580,18 +5859,14 @@ skip_hw_reset:
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_start(&ring->sched, true);
}
- if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
- amdgpu_mes_self_test(tmp_adev);
-
- if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
+ if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
- }
if (tmp_adev->asic_reset_res)
r = tmp_adev->asic_reset_res;
@@ -5629,11 +5904,11 @@ skip_sched_resume:
amdgpu_ras_set_error_query_ready(tmp_adev, true);
}
-recover_end:
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+end_reset:
if (hive) {
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
@@ -5647,6 +5922,44 @@ recover_end:
}
/**
+ * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
+ *
+ * @adev: amdgpu_device pointer
+ * @speed: pointer to the speed of the link
+ * @width: pointer to the width of the link
+ *
+ * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
+ * first physical partner to an AMD dGPU.
+ * This will exclude any virtual switches and links.
+ */
+static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
+ enum pci_bus_speed *speed,
+ enum pcie_link_width *width)
+{
+ struct pci_dev *parent = adev->pdev;
+
+ if (!speed || !width)
+ return;
+
+ *speed = PCI_SPEED_UNKNOWN;
+ *width = PCIE_LNK_WIDTH_UNKNOWN;
+
+ if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
+ while ((parent = pci_upstream_bridge(parent))) {
+ /* skip upstream/downstream switches internal to the dGPU */
+ if (parent->vendor == PCI_VENDOR_ID_ATI)
+ continue;
+ *speed = pcie_get_speed_cap(parent);
+ *width = pcie_get_width_cap(parent);
+ break;
+ }
+ } else {
+ /* use the current speeds rather than max if switching is not supported */
+ pcie_bandwidth_available(adev->pdev, NULL, speed, width);
+ }
+}
+
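
Editor's note: amdgpu_device_partner_bandwidth() above climbs the PCI hierarchy and ignores bridges that belong to the GPU package itself. A condensed, illustrative version of that walk is below; first_external_port() is a made-up name. The driver then queries the returned port with pcie_get_speed_cap()/pcie_get_width_cap(), or falls back to pcie_bandwidth_available() when dynamic switching is not supported.

#include <linux/pci.h>

static struct pci_dev *first_external_port(struct pci_dev *pdev, u16 gpu_vendor)
{
	struct pci_dev *parent = pdev;

	while ((parent = pci_upstream_bridge(parent))) {
		/* bridges reporting the GPU vendor sit inside the dGPU package */
		if (parent->vendor == gpu_vendor)
			continue;
		return parent; /* first partner outside the dGPU */
	}

	return NULL; /* no external upstream port found */
}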
+/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
*
* @adev: amdgpu_device pointer
@@ -5679,8 +5992,8 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
return;
- pcie_bandwidth_available(adev->pdev, NULL,
- &platform_speed_cap, &platform_link_width);
+ amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
+ &platform_link_width);
if (adev->pm.pcie_gen_mask == 0) {
/* asic caps */
@@ -5907,7 +6220,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_stop(&ring->sched, NULL);
@@ -5957,6 +6270,20 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
+ struct amdgpu_hive_info *hive;
+ int hive_ras_recovery = 0;
+ struct amdgpu_ras *ras;
+
+ /* PCI error slot reset should be skipped during RAS recovery */
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive) {
+ hive_ras_recovery = atomic_read(&hive->ras_recovery);
+ amdgpu_put_xgmi_hive(hive);
+ }
+ ras = amdgpu_ras_get_context(adev);
+ if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
+ ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
+ return PCI_ERS_RESULT_RECOVERED;
DRM_INFO("PCI error: slot reset callback!!\n");
@@ -6035,7 +6362,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
- if (!ring || !ring->sched.thread)
+ if (!amdgpu_ring_sched_ready(ring))
continue;
drm_sched_start(&ring->sched, true);
@@ -6264,7 +6591,7 @@ bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
return true;
default:
/* IP discovery */
- if (!adev->ip_versions[DCE_HWIP][0] ||
+ if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
(adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
return true;