Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 824
1 file changed, 591 insertions(+), 233 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8a5a8ff5d362..130a9adf09ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -65,11 +65,14 @@
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"
+#include "amdgpu_reset.h"
#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
+
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -81,6 +84,7 @@ MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
@@ -110,6 +114,7 @@ const char *amdgpu_asic_name[] = {
"RAVEN",
"ARCTURUS",
"RENOIR",
+ "ALDEBARAN",
"NAVI10",
"NAVI14",
"NAVI12",
@@ -117,6 +122,8 @@ const char *amdgpu_asic_name[] = {
"NAVY_FLOUNDER",
"VANGOGH",
"DIMGREY_CAVEFISH",
+ "BEIGE_GOBY",
+ "YELLOW_CARP",
"LAST",
};
@@ -136,7 +143,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
struct amdgpu_device *adev = drm_to_adev(ddev);
uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
- return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
+ return sysfs_emit(buf, "%llu\n", cnt);
}
static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
@@ -160,7 +167,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev,
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
- return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+ return sysfs_emit(buf, "%s\n", adev->product_name);
}
static DEVICE_ATTR(product_name, S_IRUGO,
@@ -182,7 +189,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev,
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
- return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+ return sysfs_emit(buf, "%s\n", adev->product_number);
}
static DEVICE_ATTR(product_number, S_IRUGO,
@@ -204,25 +211,25 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev,
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
- return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+ return sysfs_emit(buf, "%s\n", adev->serial);
}
static DEVICE_ATTR(serial_number, S_IRUGO,
amdgpu_device_get_serial_number, NULL);
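The snprintf() to sysfs_emit() conversions above follow the current sysfs convention: sysfs_emit() knows the show() buffer is a full page, bounds the output to PAGE_SIZE itself, and warns on misuse. A minimal sketch of the resulting pattern, with a hypothetical attribute name:

    /* Hypothetical show() callback illustrating the sysfs_emit() pattern
     * used by the conversions above; not part of this patch. */
    static ssize_t example_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
    {
            /* sysfs_emit() caps the write at PAGE_SIZE internally. */
            return sysfs_emit(buf, "%u\n", 42);
    }
    static DEVICE_ATTR_RO(example);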
/**
- * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
+ * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
*
* @dev: drm_device pointer
*
- * Returns true if the device is a dGPU with HG/PX power control,
+ * Returns true if the device is a dGPU with ATPX power control,
* otherwise return false.
*/
-bool amdgpu_device_supports_atpx(struct drm_device *dev)
+bool amdgpu_device_supports_px(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
- if (adev->flags & AMD_IS_PX)
+ if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
return true;
return false;
}
@@ -232,14 +239,15 @@ bool amdgpu_device_supports_atpx(struct drm_device *dev)
*
* @dev: drm_device pointer
*
- * Returns true if the device is a dGPU with HG/PX power control,
+ * Returns true if the device is a dGPU with ACPI power control,
* otherwise return false.
*/
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
- if (adev->has_pr3)
+ if (adev->has_pr3 ||
+ ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
return true;
return false;
}
@@ -259,6 +267,21 @@ bool amdgpu_device_supports_baco(struct drm_device *dev)
return amdgpu_asic_supports_baco(adev);
}
+/**
+ * amdgpu_device_supports_smart_shift - Is the device a dGPU with
+ * smart shift support
+ *
+ * @dev: drm_device pointer
+ *
+ * Returns true if the device is a dGPU with Smart Shift support,
+ * otherwise returns false.
+ */
+bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
+{
+ return (amdgpu_device_supports_boco(dev) &&
+ amdgpu_acpi_is_power_shift_control_supported());
+}
+
/*
* VRAM access helper functions
*/
@@ -278,7 +301,10 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
unsigned long flags;
uint32_t hi = ~0;
uint64_t last;
+ int idx;
+ if (!drm_dev_enter(&adev->ddev, &idx))
+ return;
#ifdef CONFIG_64BIT
last = min(pos + size, adev->gmc.visible_vram_size);
@@ -289,15 +315,15 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
if (write) {
memcpy_toio(addr, buf, count);
mb();
- amdgpu_asic_flush_hdp(adev, NULL);
+ amdgpu_device_flush_hdp(adev, NULL);
} else {
- amdgpu_asic_invalidate_hdp(adev, NULL);
+ amdgpu_device_invalidate_hdp(adev, NULL);
mb();
memcpy_fromio(buf, addr, count);
}
if (count == size)
- return;
+ goto exit;
pos += count;
buf += count / 4;
@@ -320,11 +346,45 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
*buf++ = RREG32_NO_KIQ(mmMM_DATA);
}
spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+
+#ifdef CONFIG_64BIT
+exit:
+#endif
+ drm_dev_exit(idx);
}
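The drm_dev_enter()/drm_dev_exit() pair added above protects the whole VRAM access path against concurrent device unplug: when drm_dev_enter() returns false the device is already gone and the hardware must not be touched. The guard in its general form, as a sketch:

    /* Illustrative hotplug guard; 'drm' stands for any struct drm_device. */
    int idx;

    if (!drm_dev_enter(drm, &idx))
            return; /* device unplugged, skip the hardware access */
    /* ... safe to touch hardware here ... */
    drm_dev_exit(idx);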
/*
* register access helper functions.
*/
+
+/* Check if hw access should be skipped because of hotplug or device error */
+bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
+{
+ if (adev->no_hw_access)
+ return true;
+
+#ifdef CONFIG_LOCKDEP
+ /*
+ * This is a bit complicated to understand, so worth a comment. What we assert
+ * here is that the GPU reset is not running on another thread in parallel.
+ *
+ * For this we trylock the read side of the reset semaphore; if that succeeds,
+ * we know that the reset is not running in parallel.
+ *
+ * If the trylock fails we assert that we are either already holding the read
+ * side of the lock or are the reset thread itself and hold the write side of
+ * the lock.
+ */
+ if (in_task()) {
+ if (down_read_trylock(&adev->reset_sem))
+ up_read(&adev->reset_sem);
+ else
+ lockdep_assert_held(&adev->reset_sem);
+ }
+#endif
+ return false;
+}
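The lockdep check above encodes "either no reset is in flight, or this thread itself holds reset_sem" without ever blocking. The idiom in isolation, as a sketch:

    /* Taking the read side proves no writer (no reset) is active;
     * if the trylock fails, lockdep must show we already hold the
     * semaphore ourselves (read side, or write side in the reset thread). */
    if (down_read_trylock(&sem))
            up_read(&sem);
    else
            lockdep_assert_held(&sem);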
+
/**
* amdgpu_device_rreg - read a memory mapped IO or indirect register
*
@@ -339,7 +399,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
{
uint32_t ret;
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return 0;
if ((reg * 4) < adev->rmmio_size) {
@@ -376,7 +436,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
*/
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return 0;
if (offset < adev->rmmio_size)
@@ -401,7 +461,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
*/
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return;
if (offset < adev->rmmio_size)
@@ -424,7 +484,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t v,
uint32_t acc_flags)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return;
if ((reg * 4) < adev->rmmio_size) {
@@ -451,63 +511,20 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
uint32_t reg, uint32_t v)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return;
if (amdgpu_sriov_fullaccess(adev) &&
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) {
if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
- return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
+ return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0, 0);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
}
/**
- * amdgpu_io_rreg - read an IO register
- *
- * @adev: amdgpu_device pointer
- * @reg: dword aligned register offset
- *
- * Returns the 32 bit value from the offset specified.
- */
-u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
-{
- if (adev->in_pci_err_recovery)
- return 0;
-
- if ((reg * 4) < adev->rio_mem_size)
- return ioread32(adev->rio_mem + (reg * 4));
- else {
- iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
- return ioread32(adev->rio_mem + (mmMM_DATA * 4));
- }
-}
-
-/**
- * amdgpu_io_wreg - write to an IO register
- *
- * @adev: amdgpu_device pointer
- * @reg: dword aligned register offset
- * @v: 32 bit value to write to the register
- *
- * Writes the value specified to the offset specified.
- */
-void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
-{
- if (adev->in_pci_err_recovery)
- return;
-
- if ((reg * 4) < adev->rio_mem_size)
- iowrite32(v, adev->rio_mem + (reg * 4));
- else {
- iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
- iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
- }
-}
-
-/**
* amdgpu_mm_rdoorbell - read a doorbell dword
*
* @adev: amdgpu_device pointer
@@ -518,7 +535,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
*/
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return 0;
if (index < adev->doorbell.num_doorbells) {
@@ -541,7 +558,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
*/
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return;
if (index < adev->doorbell.num_doorbells) {
@@ -562,7 +579,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
*/
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return 0;
if (index < adev->doorbell.num_doorbells) {
@@ -585,7 +602,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
*/
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
- if (adev->in_pci_err_recovery)
+ if (amdgpu_device_skip_hw_access(adev))
return;
if (index < adev->doorbell.num_doorbells) {
@@ -1223,6 +1240,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
}
+ /* Don't post if we need to reset the whole hive on init */
+ if (adev->gmc.xgmi.pending_reset)
+ return false;
+
if (adev->has_hw_reset) {
adev->has_hw_reset = false;
return true;
@@ -1429,7 +1450,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
struct drm_device *dev = pci_get_drvdata(pdev);
int r;
- if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF)
+ if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
return;
if (state == VGA_SWITCHEROO_ON) {
@@ -1690,6 +1711,19 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
if (!ip_block_version)
return -EINVAL;
+ switch (ip_block_version->type) {
+ case AMD_IP_BLOCK_TYPE_VCN:
+ if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
+ return 0;
+ break;
+ case AMD_IP_BLOCK_TYPE_JPEG:
+ if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
+ return 0;
+ break;
+ default:
+ break;
+ }
+
DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
ip_block_version->funcs->name);
@@ -1810,9 +1844,11 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
case CHIP_CARRIZO:
case CHIP_STONEY:
case CHIP_VEGA20:
+ case CHIP_ALDEBARAN:
case CHIP_SIENNA_CICHLID:
case CHIP_NAVY_FLOUNDER:
case CHIP_DIMGREY_CAVEFISH:
+ case CHIP_BEIGE_GOBY:
default:
return 0;
case CHIP_VEGA10:
@@ -1850,6 +1886,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
case CHIP_VANGOGH:
chip_name = "vangogh";
break;
+ case CHIP_YELLOW_CARP:
+ chip_name = "yellow_carp";
+ break;
}
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
@@ -2010,6 +2049,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
case CHIP_RAVEN:
case CHIP_ARCTURUS:
case CHIP_RENOIR:
+ case CHIP_ALDEBARAN:
if (adev->flags & AMD_IS_APU)
adev->family = AMDGPU_FAMILY_RV;
else
@@ -2025,9 +2065,13 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
case CHIP_SIENNA_CICHLID:
case CHIP_NAVY_FLOUNDER:
case CHIP_DIMGREY_CAVEFISH:
+ case CHIP_BEIGE_GOBY:
case CHIP_VANGOGH:
+ case CHIP_YELLOW_CARP:
if (adev->asic_type == CHIP_VANGOGH)
adev->family = AMDGPU_FAMILY_VGH;
+ else if (adev->asic_type == CHIP_YELLOW_CARP)
+ adev->family = AMDGPU_FAMILY_YC;
else
adev->family = AMDGPU_FAMILY_NV;
@@ -2045,6 +2089,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
adev->pm.pp_feature = amdgpu_pp_feature_mask;
if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
+ if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
+ adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
for (i = 0; i < adev->num_ip_blocks; i++) {
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
@@ -2083,6 +2129,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
return r;
}
+
+ /* get pf2vf msg info at the earliest possible time */
+ if (amdgpu_sriov_vf(adev))
+ amdgpu_virt_init_data_exchange(adev);
+
}
}
@@ -2149,6 +2200,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
+ if (!adev->ip_blocks[i].status.sw)
+ continue;
+
/* no need to do the fw loading again if already done*/
if (adev->ip_blocks[i].status.hw == true)
break;
@@ -2289,7 +2343,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev);
- amdgpu_amdkfd_device_init(adev);
+
+ /* Don't init kfd if the whole hive needs to be reset during init */
+ if (!adev->gmc.xgmi.pending_reset)
+ amdgpu_amdkfd_device_init(adev);
amdgpu_fru_get_product_info(adev);
@@ -2359,8 +2416,8 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
* Returns 0 on success, negative error code on failure.
*/
-static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
- enum amd_clockgating_state state)
+int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
+ enum amd_clockgating_state state)
{
int i, j, r;
@@ -2395,7 +2452,8 @@ static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
return 0;
}
-static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
+int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
+ enum amd_powergating_state state)
{
int i, j, r;
@@ -2506,6 +2564,11 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (r)
DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
+ /* For XGMI + passthrough configuration on arcturus, enable light SBR */
+ if (adev->asic_type == CHIP_ARCTURUS &&
+ amdgpu_passthrough(adev) &&
+ adev->gmc.xgmi.num_physical_nodes > 1)
+ smu_set_light_sbr(&adev->smu, true);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
mutex_lock(&mgpu_info.mutex);
@@ -2544,34 +2607,26 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
return 0;
}
-/**
- * amdgpu_device_ip_fini - run fini for hardware IPs
- *
- * @adev: amdgpu_device pointer
- *
- * Main teardown pass for hardware IPs. The list of all the hardware
- * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
- * are run. hw_fini tears down the hardware associated with each IP
- * and sw_fini tears down any software state associated with each IP.
- * Returns 0 on success, negative error code on failure.
- */
-static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
+static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
{
int i, r;
- if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
- amdgpu_virt_release_ras_err_handler_data(adev);
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (!adev->ip_blocks[i].version->funcs->early_fini)
+ continue;
- amdgpu_ras_pre_fini(adev);
+ r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
+ if (r) {
+ DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
+ }
+ }
- if (adev->gmc.xgmi.num_physical_nodes > 1)
- amdgpu_xgmi_remove_device(adev);
+ amdgpu_amdkfd_suspend(adev, false);
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
- amdgpu_amdkfd_device_fini(adev);
-
/* need to disable SMC first */
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.hw)
@@ -2602,6 +2657,33 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
}
+ return 0;
+}
+
+/**
+ * amdgpu_device_ip_fini - run fini for hardware IPs
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Main teardown pass for hardware IPs. The list of all the hardware
+ * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
+ * are run. hw_fini tears down the hardware associated with each IP
+ * and sw_fini tears down any software state associated with each IP.
+ * Returns 0 on success, negative error code on failure.
+ */
+static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
+{
+ int i, r;
+
+ if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
+ amdgpu_virt_release_ras_err_handler_data(adev);
+
+ amdgpu_ras_pre_fini(adev);
+
+ if (adev->gmc.xgmi.num_physical_nodes > 1)
+ amdgpu_xgmi_remove_device(adev);
+
+ amdgpu_amdkfd_device_fini_sw(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.sw)
@@ -2743,6 +2825,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
continue;
}
+ /* skip unnecessary suspend if we have not initialized them yet */
+ if (adev->gmc.xgmi.pending_reset &&
+ !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+ adev->ip_blocks[i].status.hw = false;
+ continue;
+ }
+
/* skip suspend of gfx and psp for S0ix
* gfx is in gfxoff state, so on resume it will exit gfxoff just
* like at runtime. PSP is also part of the always on hardware
@@ -2772,7 +2864,6 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
}
}
}
- adev->ip_blocks[i].status.hw = false;
}
return 0;
@@ -2793,8 +2884,10 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
int r;
- if (amdgpu_sriov_vf(adev))
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_fini_data_exchange(adev);
amdgpu_virt_request_full_gpu(adev, false);
+ }
r = amdgpu_device_ip_suspend_phase1(adev);
if (r)
@@ -2818,7 +2911,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
AMD_IP_BLOCK_TYPE_IH,
};
- for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
+ for (i = 0; i < adev->num_ip_blocks; i++) {
int j;
struct amdgpu_ip_block *block;
@@ -2996,7 +3089,7 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
if (amdgpu_sriov_vf(adev)) {
if (adev->is_atom_fw) {
- if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
+ if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
} else {
if (amdgpu_atombios_has_gpu_virtualization_table(adev))
@@ -3059,7 +3152,9 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
case CHIP_SIENNA_CICHLID:
case CHIP_NAVY_FLOUNDER:
case CHIP_DIMGREY_CAVEFISH:
+ case CHIP_BEIGE_GOBY:
case CHIP_VANGOGH:
+ case CHIP_YELLOW_CARP:
#endif
return amdgpu_dc != 0;
#endif
@@ -3080,13 +3175,14 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
*/
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
- if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
+ if (amdgpu_sriov_vf(adev) ||
+ adev->enable_virtual_display ||
+ (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
return amdgpu_device_asic_has_dc_support(adev->asic_type);
}
-
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
@@ -3117,8 +3213,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (adev->asic_reset_res)
goto fail;
- if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
- adev->mmhub.funcs->reset_ras_error_count(adev);
+ if (adev->mmhub.ras_funcs &&
+ adev->mmhub.ras_funcs->reset_ras_error_count)
+ adev->mmhub.ras_funcs->reset_ras_error_count(adev);
} else {
task_barrier_full(&hive->tb);
@@ -3141,8 +3238,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
int ret = 0;
/*
- * By default timeout for non compute jobs is 10000.
- * And there is no timeout enforced on compute jobs.
+ * By default the timeout for non-compute jobs is 10000 ms
+ * and 60000 ms for compute jobs.
* In SR-IOV or passthrough mode, the timeout for compute
* jobs is 60000 ms by default.
*/
@@ -3151,10 +3248,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
- else if (amdgpu_passthrough(adev))
- adev->compute_timeout = msecs_to_jiffies(60000);
else
- adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
+ adev->compute_timeout = msecs_to_jiffies(60000);
if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
while ((timeout_setting = strsep(&input, ",")) &&
@@ -3211,7 +3306,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
NULL
};
-
/**
* amdgpu_device_init - initialize the driver
*
@@ -3228,7 +3322,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
struct drm_device *ddev = adev_to_drm(adev);
struct pci_dev *pdev = adev->pdev;
int r, i;
- bool atpx = false;
+ bool px = false;
u32 max_MBps;
adev->shutdown = false;
@@ -3250,6 +3344,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->vm_manager.vm_pte_funcs = NULL;
adev->vm_manager.vm_pte_num_scheds = 0;
adev->gmc.gmc_funcs = NULL;
+ adev->harvest_ip_mask = 0x0;
adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
@@ -3276,7 +3371,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* mutex initialization are all done here so we
* can recall function without having locking issues */
- atomic_set(&adev->irq.ih.lock, 0);
mutex_init(&adev->firmware.mutex);
mutex_init(&adev->pm.mutex);
mutex_init(&adev->gfx.gpu_clock_mutex);
@@ -3309,6 +3403,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(&adev->shadow_list);
mutex_init(&adev->shadow_list_lock);
+ INIT_LIST_HEAD(&adev->reset_list);
+
INIT_DELAYED_WORK(&adev->delayed_init_work,
amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -3347,17 +3443,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
- /* io port mapping */
- for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
- if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
- adev->rio_mem_size = pci_resource_len(adev->pdev, i);
- adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
- break;
- }
- }
- if (adev->rio_mem == NULL)
- DRM_INFO("PCI I/O BAR is not found.\n");
-
/* enable PCIE atomic ops */
r = pci_enable_atomic_ops_to_root(adev->pdev,
PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
@@ -3394,29 +3479,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
/* doorbell bar mapping and doorbell index init*/
amdgpu_device_doorbell_init(adev);
- /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
- /* this will fail for cards that aren't VGA class devices, just
- * ignore it */
- if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
- vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
-
- if (amdgpu_device_supports_atpx(ddev))
- atpx = true;
- if (amdgpu_has_atpx() &&
- (amdgpu_is_atpx_hybrid() ||
- amdgpu_has_atpx_dgpu_power_cntl()) &&
- !pci_is_thunderbolt_attached(adev->pdev))
- vga_switcheroo_register_client(adev->pdev,
- &amdgpu_switcheroo_ops, atpx);
- if (atpx)
- vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
-
if (amdgpu_emu_mode == 1) {
/* post the asic on emulation mode */
emu_soc_asic_init(adev);
goto fence_driver_init;
}
+ amdgpu_reset_init(adev);
+
/* detect if we are with an SRIOV vbios */
amdgpu_device_detect_sriov_bios(adev);
@@ -3424,10 +3494,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
* E.g., driver was not cleanly unloaded previously, etc.
*/
if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
- r = amdgpu_asic_reset(adev);
- if (r) {
- dev_err(adev->dev, "asic reset on init failed\n");
- goto failed;
+ if (adev->gmc.xgmi.num_physical_nodes) {
+ dev_info(adev->dev, "Pending hive reset.\n");
+ adev->gmc.xgmi.pending_reset = true;
+ /* Only need to init the blocks necessary for SMU to handle the reset */
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (!adev->ip_blocks[i].status.valid)
+ continue;
+ if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+ DRM_DEBUG("IP %s disabled for hw_init.\n",
+ adev->ip_blocks[i].version->funcs->name);
+ adev->ip_blocks[i].status.hw = true;
+ }
+ }
+ } else {
+ r = amdgpu_asic_reset(adev);
+ if (r) {
+ dev_err(adev->dev, "asic reset on init failed\n");
+ goto failed;
+ }
}
}
@@ -3493,11 +3581,11 @@ fence_driver_init:
adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
adev->virt.ops = NULL;
r = -EAGAIN;
- goto failed;
+ goto release_ras_con;
}
dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
- goto failed;
+ goto release_ras_con;
}
dev_info(adev->dev,
@@ -3558,19 +3646,19 @@ fence_driver_init:
/* enable clockgating, etc. after ib tests, etc. since some blocks require
* explicit gating rather than handling it automatically.
*/
- r = amdgpu_device_ip_late_init(adev);
- if (r) {
- dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
- amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
- goto failed;
+ if (!adev->gmc.xgmi.pending_reset) {
+ r = amdgpu_device_ip_late_init(adev);
+ if (r) {
+ dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+ amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+ goto release_ras_con;
+ }
+ /* must succeed. */
+ amdgpu_ras_resume(adev);
+ queue_delayed_work(system_wq, &adev->delayed_init_work,
+ msecs_to_jiffies(AMDGPU_RESUME_MS));
}
- /* must succeed. */
- amdgpu_ras_resume(adev);
-
- queue_delayed_work(system_wq, &adev->delayed_init_work,
- msecs_to_jiffies(AMDGPU_RESUME_MS));
-
if (amdgpu_sriov_vf(adev))
flush_delayed_work(&adev->delayed_init_work);
@@ -3587,12 +3675,30 @@ fence_driver_init:
if (amdgpu_device_cache_pci_state(adev->pdev))
pci_restore_state(pdev);
+ /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
+ /* this will fail for cards that aren't VGA class devices, just
+ * ignore it */
+ if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
+ vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
+
+ if (amdgpu_device_supports_px(ddev)) {
+ px = true;
+ vga_switcheroo_register_client(adev->pdev,
+ &amdgpu_switcheroo_ops, px);
+ vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
+ }
+
+ if (adev->gmc.xgmi.pending_reset)
+ queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+ msecs_to_jiffies(AMDGPU_RESUME_MS));
+
return 0;
+release_ras_con:
+ amdgpu_release_ras_context(adev);
+
failed:
amdgpu_vf_error_trans_all(adev);
- if (atpx)
- vga_switcheroo_fini_domain_pm_ops(adev->dev);
failed_unmap:
iounmap(adev->rmmio);
@@ -3601,6 +3707,27 @@ failed_unmap:
return r;
}
+static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
+{
+ /* Clear all CPU mappings pointing to this device */
+ unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+
+ /* Unmap all mapped bars - Doorbell, registers and VRAM */
+ amdgpu_device_doorbell_fini(adev);
+
+ iounmap(adev->rmmio);
+ adev->rmmio = NULL;
+ if (adev->mman.aper_base_kaddr)
+ iounmap(adev->mman.aper_base_kaddr);
+ adev->mman.aper_base_kaddr = NULL;
+
+ /* Memory manager related */
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ arch_phys_wc_del(adev->gmc.vram_mtrr);
+ arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
+ }
+}
+
/**
* amdgpu_device_fini - tear down the driver
*
@@ -3609,14 +3736,13 @@ failed_unmap:
* Tear down the driver info (all asics).
* Called at driver shutdown.
*/
-void amdgpu_device_fini(struct amdgpu_device *adev)
+void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(&adev->delayed_init_work);
+ ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
adev->shutdown = true;
- kfree(adev->pci_state);
-
/* make sure IB test finished before entering exclusive mode
* to avoid preemption on IB test
* */
@@ -3633,14 +3759,35 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
else
drm_atomic_helper_shutdown(adev_to_drm(adev));
}
- amdgpu_fence_driver_fini(adev);
+ amdgpu_fence_driver_fini_hw(adev);
+
if (adev->pm_sysfs_en)
amdgpu_pm_sysfs_fini(adev);
+ if (adev->ucode_sysfs_en)
+ amdgpu_ucode_sysfs_fini(adev);
+ sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
+
amdgpu_fbdev_fini(adev);
+
+ amdgpu_irq_fini_hw(adev);
+
+ amdgpu_device_ip_fini_early(adev);
+
+ amdgpu_gart_dummy_page_fini(adev);
+
+ amdgpu_device_unmap_mmio(adev);
+}
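Teardown is now split in two: amdgpu_device_fini_hw() quiesces the hardware and unmaps MMIO while the PCI device may already be disappearing, and amdgpu_device_fini_sw() frees the remaining software state later. A sketch of the intended caller ordering; the real call sites live in the driver's remove and DRM release paths, outside this file:

    /* Hypothetical caller ordering for the split teardown. */
    amdgpu_device_fini_hw(adev);  /* at PCI remove: stop hw, unmap MMIO */
    /* ... wait for DRM file handles to drain ... */
    amdgpu_device_fini_sw(adev);  /* at DRM release: free software state */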
+
+void amdgpu_device_fini_sw(struct amdgpu_device *adev)
+{
amdgpu_device_ip_fini(adev);
+ amdgpu_fence_driver_fini_sw(adev);
release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
+
+ amdgpu_reset_fini(adev);
+
/* free i2c buses */
if (!amdgpu_device_has_dc_support(adev))
amdgpu_i2c_fini(adev);
@@ -3650,30 +3797,20 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
kfree(adev->bios);
adev->bios = NULL;
- if (amdgpu_has_atpx() &&
- (amdgpu_is_atpx_hybrid() ||
- amdgpu_has_atpx_dgpu_power_cntl()) &&
- !pci_is_thunderbolt_attached(adev->pdev))
+ if (amdgpu_device_supports_px(adev_to_drm(adev))) {
vga_switcheroo_unregister_client(adev->pdev);
- if (amdgpu_device_supports_atpx(adev_to_drm(adev)))
vga_switcheroo_fini_domain_pm_ops(adev->dev);
+ }
if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
vga_client_register(adev->pdev, NULL, NULL, NULL);
- if (adev->rio_mem)
- pci_iounmap(adev->pdev, adev->rio_mem);
- adev->rio_mem = NULL;
- iounmap(adev->rmmio);
- adev->rmmio = NULL;
- amdgpu_device_doorbell_fini(adev);
-
- if (adev->ucode_sysfs_en)
- amdgpu_ucode_sysfs_fini(adev);
- sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
amdgpu_pmu_fini(adev);
if (adev->mman.discovery_bin)
amdgpu_discovery_fini(adev);
+
+ kfree(adev->pci_state);
+
}
@@ -3693,12 +3830,15 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
struct amdgpu_device *adev = drm_to_adev(dev);
- int r;
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
return 0;
adev->in_suspend = true;
+
+ if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
+ DRM_WARN("smart shift update failed\n");
+
drm_kms_helper_poll_disable(dev);
if (fbcon)
@@ -3708,7 +3848,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
amdgpu_ras_suspend(adev);
- r = amdgpu_device_ip_suspend_phase1(adev);
+ amdgpu_device_ip_suspend_phase1(adev);
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
@@ -3718,7 +3858,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
amdgpu_fence_driver_suspend(adev);
- r = amdgpu_device_ip_suspend_phase2(adev);
+ amdgpu_device_ip_suspend_phase2(adev);
/* evict remaining vram memory
* This second call to evict vram is to evict the gart page table
* using the CPU.
@@ -3808,6 +3948,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
#endif
adev->in_suspend = false;
+ if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
+ DRM_WARN("smart shift update failed\n");
+
return 0;
}
@@ -3981,6 +4124,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
struct dma_fence *fence = NULL, *next = NULL;
struct amdgpu_bo *shadow;
+ struct amdgpu_bo_vm *vmbo;
long r = 1, tmo;
if (amdgpu_sriov_runtime(adev))
@@ -3990,12 +4134,12 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
dev_info(adev->dev, "recover vram bo from shadow start\n");
mutex_lock(&adev->shadow_list_lock);
- list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
-
+ list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
+ shadow = &vmbo->bo;
/* No need to recover an evicted BO */
- if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
- shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
- shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
+ if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
+ shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
+ shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
continue;
r = amdgpu_bo_restore_shadow(shadow, &next);
@@ -4079,11 +4223,11 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
amdgpu_amdkfd_post_reset(adev);
error:
- amdgpu_virt_release_full_gpu(adev, true);
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
amdgpu_inc_vram_lost(adev);
r = amdgpu_device_recover_vram(adev);
}
+ amdgpu_virt_release_full_gpu(adev, true);
return r;
}
@@ -4160,6 +4304,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
case CHIP_SIENNA_CICHLID:
case CHIP_NAVY_FLOUNDER:
case CHIP_DIMGREY_CAVEFISH:
+ case CHIP_VANGOGH:
+ case CHIP_ALDEBARAN:
break;
default:
goto disabled;
@@ -4173,15 +4319,60 @@ disabled:
return false;
}
+int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
+{
+ u32 i;
+ int ret = 0;
+
+ amdgpu_atombios_scratch_regs_engine_hung(adev, true);
+
+ dev_info(adev->dev, "GPU mode1 reset\n");
+
+ /* disable BM */
+ pci_clear_master(adev->pdev);
+
+ amdgpu_device_cache_pci_state(adev->pdev);
+
+ if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
+ dev_info(adev->dev, "GPU smu mode1 reset\n");
+ ret = amdgpu_dpm_mode1_reset(adev);
+ } else {
+ dev_info(adev->dev, "GPU psp mode1 reset\n");
+ ret = psp_gpu_reset(adev);
+ }
+
+ if (ret)
+ dev_err(adev->dev, "GPU mode1 reset failed\n");
+
+ amdgpu_device_load_pci_state(adev->pdev);
+
+ /* wait for asic to come out of reset */
+ for (i = 0; i < adev->usec_timeout; i++) {
+ u32 memsize = adev->nbio.funcs->get_memsize(adev);
+
+ if (memsize != 0xffffffff)
+ break;
+ udelay(1);
+ }
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_job *job,
- bool *need_full_reset_arg)
+ amdgpu_atombios_scratch_regs_engine_hung(adev, false);
+ return ret;
+}
+
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+ struct amdgpu_reset_context *reset_context)
{
int i, r = 0;
- bool need_full_reset = *need_full_reset_arg;
+ struct amdgpu_job *job = NULL;
+ bool need_full_reset =
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
+ if (reset_context->reset_req_dev == adev)
+ job = reset_context->job;
- amdgpu_debugfs_wait_dump(adev);
+ /* no need to dump if the device is not in a good state during the probe period */
+ if (!adev->gmc.xgmi.pending_reset)
+ amdgpu_debugfs_wait_dump(adev);
if (amdgpu_sriov_vf(adev)) {
/* stop the data exchange thread */
@@ -4202,6 +4393,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if(job)
drm_sched_increase_karma(&job->base);
+ r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
+ /* If reset handler not implemented, continue; otherwise return */
+ if (r == -ENOSYS)
+ r = 0;
+ else
+ return r;
+
/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
if (!amdgpu_sriov_vf(adev)) {
@@ -4220,30 +4418,47 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (need_full_reset)
r = amdgpu_device_ip_suspend(adev);
-
- *need_full_reset_arg = need_full_reset;
+ if (need_full_reset)
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+ else
+ clear_bit(AMDGPU_NEED_FULL_RESET,
+ &reset_context->flags);
}
return r;
}
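With struct amdgpu_reset_context, the old need_full_reset bool becomes a bit in reset_context->flags, manipulated with set_bit()/clear_bit()/test_bit(), and the context also carries the requesting device, the offending job, and the hive. A sketch of how a caller seeds it, mirroring the usage added to amdgpu_device_gpu_recover() later in this patch:

    struct amdgpu_reset_context reset_context;

    memset(&reset_context, 0, sizeof(reset_context));
    reset_context.method = AMD_RESET_METHOD_NONE;
    reset_context.reset_req_dev = adev;
    reset_context.job = job;
    reset_context.hive = hive;
    clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);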
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
- struct list_head *device_list_handle,
- bool *need_full_reset_arg,
- bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct list_head *device_list_handle,
+ struct amdgpu_reset_context *reset_context)
{
struct amdgpu_device *tmp_adev = NULL;
- bool need_full_reset = *need_full_reset_arg, vram_lost = false;
+ bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
+ /* Try reset handler method first */
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
+ /* If reset handler not implemented, continue; otherwise return */
+ if (r == -ENOSYS)
+ r = 0;
+ else
+ return r;
+
+ /* Reset handler not implemented, use the default method */
+ need_full_reset =
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+ skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
+
/*
- * ASIC reset has to be done on all HGMI hive nodes ASAP
+ * ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper link negotiation in FW (within 1 sec)
*/
if (!skip_hw_reset && need_full_reset) {
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+ tmp_adev->gmc.xgmi.pending_reset = false;
if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
@@ -4258,8 +4473,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
@@ -4271,22 +4485,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
if (!r && amdgpu_ras_intr_triggered()) {
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- if (tmp_adev->mmhub.funcs &&
- tmp_adev->mmhub.funcs->reset_ras_error_count)
- tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ if (tmp_adev->mmhub.ras_funcs &&
+ tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
+ tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
}
amdgpu_ras_intr_cleared();
}
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
- if (amdgpu_device_asic_init(tmp_adev))
+ r = amdgpu_device_asic_init(tmp_adev);
+ if (r) {
dev_warn(tmp_adev->dev, "asic atom init failed!");
-
- if (!r) {
+ } else {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
r = amdgpu_device_ip_resume_phase1(tmp_adev);
if (r)
@@ -4319,6 +4533,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
*/
amdgpu_register_gpu_instance(tmp_adev);
+ if (!reset_context->hive &&
+ tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+ amdgpu_xgmi_add_device(tmp_adev);
+
r = amdgpu_device_ip_late_init(tmp_adev);
if (r)
goto out;
@@ -4335,7 +4553,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
* bad_page_threshold value to fix this once
* probing driver again.
*/
- if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+ if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
/* must succeed. */
amdgpu_ras_resume(tmp_adev);
} else {
@@ -4344,8 +4562,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
}
/* Update PSP FW topology after reset */
- if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
- r = amdgpu_xgmi_update_topology(hive, tmp_adev);
+ if (reset_context->hive &&
+ tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+ r = amdgpu_xgmi_update_topology(
+ reset_context->hive, tmp_adev);
}
}
@@ -4355,7 +4575,6 @@ out:
r = amdgpu_ib_ring_tests(tmp_adev);
if (r) {
dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
- r = amdgpu_device_ip_suspend(tmp_adev);
need_full_reset = true;
r = -EAGAIN;
goto end;
@@ -4369,7 +4588,10 @@ out:
}
end:
- *need_full_reset_arg = need_full_reset;
+ if (need_full_reset)
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+ else
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
return r;
}
@@ -4385,7 +4607,6 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
down_write(&adev->reset_sem);
}
- atomic_inc(&adev->gpu_reset_counter);
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -4507,6 +4728,74 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
return 0;
}
+static void amdgpu_device_recheck_guilty_jobs(
+ struct amdgpu_device *adev, struct list_head *device_list_handle,
+ struct amdgpu_reset_context *reset_context)
+{
+ int i, r = 0;
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+ int ret = 0;
+ struct drm_sched_job *s_job;
+
+ if (!ring || !ring->sched.thread)
+ continue;
+
+ s_job = list_first_entry_or_null(&ring->sched.pending_list,
+ struct drm_sched_job, list);
+ if (s_job == NULL)
+ continue;
+
+ /* clear the job's guilty status and rely on the following step to decide the real one */
+ drm_sched_reset_karma(s_job);
+ drm_sched_resubmit_jobs_ext(&ring->sched, 1);
+
+ ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
+ if (ret == 0) { /* timeout */
+ DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
+ ring->sched.name, s_job->id);
+
+ /* set guilty */
+ drm_sched_increase_karma(s_job);
+retry:
+ /* do hw reset */
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ r = amdgpu_device_reset_sriov(adev, false);
+ if (r)
+ adev->asic_reset_res = r;
+ } else {
+ clear_bit(AMDGPU_SKIP_HW_RESET,
+ &reset_context->flags);
+ r = amdgpu_do_asic_reset(device_list_handle,
+ reset_context);
+ if (r && r == -EAGAIN)
+ goto retry;
+ }
+
+ /*
+ * bump the reset counter so that the following
+ * resubmitted job can flush the VMID
+ */
+ atomic_inc(&adev->gpu_reset_counter);
+ continue;
+ }
+
+ /* got the hw fence, signal finished fence */
+ atomic_dec(ring->sched.score);
+ dma_fence_get(&s_job->s_fence->finished);
+ dma_fence_signal(&s_job->s_fence->finished);
+ dma_fence_put(&s_job->s_fence->finished);
+
+ /* remove node from list and free the job */
+ spin_lock(&ring->sched.job_list_lock);
+ list_del_init(&s_job->list);
+ spin_unlock(&ring->sched.job_list_lock);
+ ring->sched.ops->free_job(s_job);
+ }
+}
+
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -4522,13 +4811,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
struct list_head device_list, *device_list_handle = NULL;
- bool need_full_reset = false;
bool job_signaled = false;
struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL;
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
+ int tmp_vram_lost_counter;
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
/*
* Special case: RAS triggered and full reset isn't supported
@@ -4569,6 +4861,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
mutex_lock(&hive->hive_lock);
}
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.job = job;
+ reset_context.hive = hive;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
/*
* lock the device before we try to operate the linked list
* if didn't get the device lock, don't touch the linked list since
@@ -4592,16 +4890,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/
INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
- list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
- device_list_handle = &hive->device_list;
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (!list_is_first(&adev->reset_list, &device_list))
+ list_rotate_to_front(&adev->reset_list, &device_list);
+ device_list_handle = &device_list;
} else {
- list_add_tail(&adev->gmc.xgmi.head, &device_list);
+ list_add_tail(&adev->reset_list, &device_list);
device_list_handle = &device_list;
}
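Devices are now chained through a dedicated adev->reset_list node instead of reusing gmc.xgmi.head, so grouping and rotating devices for a reset no longer mutates the hive's own device list; the device that triggered recovery is rotated to the front so it resets first. Consolidated, the construction above amounts to:

    /* Sketch of the list building in the hunk above: the hive list is
     * only iterated, never modified; reset grouping lives on reset_list. */
    INIT_LIST_HEAD(&device_list);
    list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
            list_add_tail(&tmp_adev->reset_list, &device_list);
    if (!list_is_first(&adev->reset_list, &device_list))
            list_rotate_to_front(&adev->reset_list, &device_list);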
/* block all schedulers and reset given job's ring */
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/*
* Try to put the audio codec into suspend state
* before gpu reset started.
@@ -4646,6 +4946,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (need_emergency_restart)
amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
}
+ atomic_inc(&tmp_adev->gpu_reset_counter);
}
if (need_emergency_restart)
@@ -4665,10 +4966,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
}
retry: /* Rest of adevs pre asic reset from XGMI hive. */
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- r = amdgpu_device_pre_asic_reset(tmp_adev,
- (tmp_adev == adev) ? job : NULL,
- &need_full_reset);
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
/*TODO Should we stop ?*/
if (r) {
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -4677,6 +4976,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
}
}
+ tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
/* Actual ASIC resets if needed.*/
/* TODO Implement XGMI hive reset logic for SRIOV */
if (amdgpu_sriov_vf(adev)) {
@@ -4684,7 +4984,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
+ r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN)
goto retry;
}
@@ -4692,7 +4992,19 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
skip_hw_reset:
/* Post ASIC reset for all devs .*/
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+ /*
+ * Sometimes a later bad compute job can block a good gfx job, since the
+ * gfx and compute rings share internal GC hardware. We add an additional
+ * guilty-job recheck step to find the real culprit: it synchronously
+ * resubmits jobs and waits for the first one to signal. If that wait
+ * times out, we identify the job as the real guilty one.
+ */
+ if (amdgpu_gpu_recovery == 2 &&
+ !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
+ amdgpu_device_recheck_guilty_jobs(
+ tmp_adev, device_list_handle, &reset_context);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -4719,14 +5031,23 @@ skip_hw_reset:
amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
} else {
dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
+ DRM_WARN("smart shift update failed\n");
}
}
skip_sched_resume:
- list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*unlock kfd: SRIOV would do it separately */
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+ /* unlock kfd: SRIOV would do it separately */
if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
amdgpu_amdkfd_post_reset(tmp_adev);
+
+ /* kfd_post_reset will do nothing if the kfd device is not initialized;
+ * we need to bring up kfd here if it was not initialized before
+ */
+ if (!adev->kfd.init_complete)
+ amdgpu_amdkfd_device_init(adev);
+
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
amdgpu_device_unlock_adev(tmp_adev);
@@ -4900,7 +5221,8 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
return -ENOTSUPP;
- if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+ if (ras && adev->ras_enabled &&
+ adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
return amdgpu_dpm_baco_enter(adev);
@@ -4919,7 +5241,8 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
if (ret)
return ret;
- if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+ if (ras && adev->ras_enabled &&
+ adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
return 0;
@@ -4988,6 +5311,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
drm_sched_stop(&ring->sched, NULL);
}
+ atomic_inc(&adev->gpu_reset_counter);
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -5029,14 +5353,16 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
int r, i;
- bool need_full_reset = true;
+ struct amdgpu_reset_context reset_context;
u32 memsize;
struct list_head device_list;
DRM_INFO("PCI error: slot reset callback!!\n");
+ memset(&reset_context, 0, sizeof(reset_context));
+
INIT_LIST_HEAD(&device_list);
- list_add_tail(&adev->gmc.xgmi.head, &device_list);
+ list_add_tail(&adev->reset_list, &device_list);
/* wait for asic to come out of reset */
msleep(500);
@@ -5057,13 +5383,18 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
goto out;
}
- adev->in_pci_err_recovery = true;
- r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
- adev->in_pci_err_recovery = false;
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+
+ adev->no_hw_access = true;
+ r = amdgpu_device_pre_asic_reset(adev, &reset_context);
+ adev->no_hw_access = false;
if (r)
goto out;
- r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+ r = amdgpu_do_asic_reset(&device_list, &reset_context);
out:
if (!r) {
@@ -5154,4 +5485,31 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
return true;
}
+void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
+{
+#ifdef CONFIG_X86_64
+ if (adev->flags & AMD_IS_APU)
+ return;
+#endif
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return;
+ if (ring && ring->funcs->emit_hdp_flush)
+ amdgpu_ring_emit_hdp_flush(ring);
+ else
+ amdgpu_asic_flush_hdp(adev, ring);
+}
+
+void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
+{
+#ifdef CONFIG_X86_64
+ if (adev->flags & AMD_IS_APU)
+ return;
+#endif
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return;
+
+ amdgpu_asic_invalidate_hdp(adev, ring);
+}
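These wrappers centralize the APU and CPU-coherent XGMI special cases so callers stop invoking the asic HDP callbacks directly; the VRAM access helper earlier in this patch already switched over. The call-site change, sketched under the same assumptions as that hunk:

    /* Was: amdgpu_asic_flush_hdp(adev, NULL);
     * Now the wrapper, which no-ops on APUs and on parts whose VRAM
     * is CPU-coherent via XGMI. */
    memcpy_toio(addr, buf, count);
    mb();
    amdgpu_device_flush_hdp(adev, NULL);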