Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  1624
1 file changed, 1135 insertions(+), 489 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7d3b54615147..ab8f970b2849 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -30,6 +30,11 @@
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+#include <linux/pci-p2pdma.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
@@ -54,7 +59,6 @@
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
-#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"
@@ -71,19 +75,19 @@
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
+
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
-MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
-MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
-MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
-MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
+#define AMDGPU_MAX_RETRY_LIMIT 2
+#define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
const char *amdgpu_asic_name[] = {
"TAHITI",
@@ -113,12 +117,16 @@ const char *amdgpu_asic_name[] = {
"RENOIR",
"ALDEBARAN",
"NAVI10",
+ "CYAN_SKILLFISH",
"NAVI14",
"NAVI12",
"SIENNA_CICHLID",
"NAVY_FLOUNDER",
"VANGOGH",
"DIMGREY_CAVEFISH",
+ "BEIGE_GOBY",
+ "YELLOW_CARP",
+ "IP DISCOVERY",
"LAST",
};
@@ -262,12 +270,27 @@ bool amdgpu_device_supports_baco(struct drm_device *dev)
return amdgpu_asic_supports_baco(adev);
}
+/**
+ * amdgpu_device_supports_smart_shift - Is the device a dGPU with
+ * Smart Shift support
+ *
+ * @dev: drm_device pointer
+ *
+ * Returns true if the device is a dGPU with Smart Shift support,
+ * otherwise returns false.
+ */
+bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
+{
+ return (amdgpu_device_supports_boco(dev) &&
+ amdgpu_acpi_is_power_shift_control_supported());
+}
+
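
amdgpu_device_supports_smart_shift() only reports capability; the actual Smart Shift transitions are requested from the suspend/resume paths later in this patch via amdgpu_acpi_smart_shift_update(). A minimal sketch of such a caller, using only helpers visible in this file (the wrapper name is hypothetical, not part of the patch):

static void example_notify_smart_shift_d3(struct drm_device *dev)
{
	/* skip ASICs without Smart Shift (not a BOCO dGPU with power-shift ACPI) */
	if (!amdgpu_device_supports_smart_shift(dev))
		return;

	/* AMDGPU_SS_DEV_D3 is the same state token used in amdgpu_device_suspend() */
	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
		DRM_WARN("smart shift update failed\n");
}
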
/*
* VRAM access helper functions
*/
/**
- * amdgpu_device_vram_access - read/write a buffer in vram
+ * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
*
* @adev: amdgpu_device pointer
* @pos: offset of the buffer in vram
@@ -275,54 +298,107 @@ bool amdgpu_device_supports_baco(struct drm_device *dev)
* @size: read/write size, sizeof(@buf) must be >= @size
* @write: true - write to vram, otherwise - read from vram
*/
-void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
- uint32_t *buf, size_t size, bool write)
+void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
+ void *buf, size_t size, bool write)
{
unsigned long flags;
- uint32_t hi = ~0;
+ uint32_t hi = ~0, tmp = 0;
+ uint32_t *data = buf;
uint64_t last;
+ int idx;
+
+ if (!drm_dev_enter(adev_to_drm(adev), &idx))
+ return;
+
+ BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
+
+ spin_lock_irqsave(&adev->mmio_idx_lock, flags);
+ for (last = pos + size; pos < last; pos += 4) {
+ tmp = pos >> 31;
+
+ WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
+ if (tmp != hi) {
+ WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
+ hi = tmp;
+ }
+ if (write)
+ WREG32_NO_KIQ(mmMM_DATA, *data++);
+ else
+ *data++ = RREG32_NO_KIQ(mmMM_DATA);
+ }
+ spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
+ drm_dev_exit(idx);
+}
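
The loop above drives the classic MM_INDEX/MM_DATA indirect window: bits [30:0] of the VRAM offset plus the aperture-select bit 0x80000000 go into mmMM_INDEX, offset bits 31 and up go into mmMM_INDEX_HI, and each mmMM_DATA access then moves one dword. A stripped-down, illustrative single-dword read using the same registers (the locking and drm_dev_enter() handling shown above are omitted for brevity):

static uint32_t example_mm_read_dword(struct amdgpu_device *adev, loff_t pos)
{
	/* low 31 bits of the offset, plus the VRAM-aperture select bit */
	WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
	/* remaining high bits of the offset */
	WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31);
	return RREG32_NO_KIQ(mmMM_DATA);
}
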
+/**
+ * amdgpu_device_aper_access - access vram by the vram aperture
+ *
+ * @adev: amdgpu_device pointer
+ * @pos: offset of the buffer in vram
+ * @buf: virtual address of the buffer in system memory
+ * @size: read/write size, sizeof(@buf) must be >= @size
+ * @write: true - write to vram, otherwise - read from vram
+ *
+ * The return value is the number of bytes transferred.
+ */
+size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
+ void *buf, size_t size, bool write)
+{
#ifdef CONFIG_64BIT
+ void __iomem *addr;
+ size_t count = 0;
+ uint64_t last;
+
+ if (!adev->mman.aper_base_kaddr)
+ return 0;
+
last = min(pos + size, adev->gmc.visible_vram_size);
if (last > pos) {
- void __iomem *addr = adev->mman.aper_base_kaddr + pos;
- size_t count = last - pos;
+ addr = adev->mman.aper_base_kaddr + pos;
+ count = last - pos;
if (write) {
memcpy_toio(addr, buf, count);
mb();
- amdgpu_asic_flush_hdp(adev, NULL);
+ amdgpu_device_flush_hdp(adev, NULL);
} else {
- amdgpu_asic_invalidate_hdp(adev, NULL);
+ amdgpu_device_invalidate_hdp(adev, NULL);
mb();
memcpy_fromio(buf, addr, count);
}
- if (count == size)
- return;
-
- pos += count;
- buf += count / 4;
- size -= count;
}
+
+ return count;
+#else
+ return 0;
#endif
+}
- spin_lock_irqsave(&adev->mmio_idx_lock, flags);
- for (last = pos + size; pos < last; pos += 4) {
- uint32_t tmp = pos >> 31;
+/**
+ * amdgpu_device_vram_access - read/write a buffer in vram
+ *
+ * @adev: amdgpu_device pointer
+ * @pos: offset of the buffer in vram
+ * @buf: virtual address of the buffer in system memory
+ * @size: read/write size, sizeof(@buf) must be >= @size
+ * @write: true - write to vram, otherwise - read from vram
+ */
+void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
+ void *buf, size_t size, bool write)
+{
+ size_t count;
- WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
- if (tmp != hi) {
- WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
- hi = tmp;
- }
- if (write)
- WREG32_NO_KIQ(mmMM_DATA, *buf++);
- else
- *buf++ = RREG32_NO_KIQ(mmMM_DATA);
+ /* try using the vram aperture to access vram first */
+ count = amdgpu_device_aper_access(adev, pos, buf, size, write);
+ size -= count;
+ if (size) {
+ /* use MM to access the rest of vram */
+ pos += count;
+ buf += count;
+ amdgpu_device_mm_access(adev, pos, buf, size, write);
}
- spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
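
Callers therefore see one uniform helper regardless of whether the offset lands in the CPU-visible aperture. An illustrative use (the alignment rule comes from the BUG_ON in amdgpu_device_mm_access() above; this example is not part of the patch):

/* read four dwords starting at VRAM offset 0x1000 -- illustrative only */
static void example_peek_vram(struct amdgpu_device *adev)
{
	uint32_t data[4] = {};

	/* pos and size must both be 4-byte aligned */
	amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
}
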
/*
@@ -332,7 +408,7 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
/* Check if hw access should be skipped because of hotplug or device error */
bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
{
- if (adev->in_pci_err_recovery)
+ if (adev->no_hw_access)
return true;
#ifdef CONFIG_LOCKDEP
@@ -348,10 +424,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
* the lock.
*/
if (in_task()) {
- if (down_read_trylock(&adev->reset_sem))
- up_read(&adev->reset_sem);
+ if (down_read_trylock(&adev->reset_domain->sem))
+ up_read(&adev->reset_domain->sem);
else
- lockdep_assert_held(&adev->reset_sem);
+ lockdep_assert_held(&adev->reset_domain->sem);
}
#endif
return false;
@@ -377,9 +453,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
ret = amdgpu_kiq_rreg(adev, reg);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
} else {
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -462,9 +538,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
if ((reg * 4) < adev->rmmio_size) {
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
amdgpu_sriov_runtime(adev) &&
- down_read_trylock(&adev->reset_sem)) {
+ down_read_trylock(&adev->reset_domain->sem)) {
amdgpu_kiq_wreg(adev, reg, v);
- up_read(&adev->reset_sem);
+ up_read(&adev->reset_domain->sem);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -475,11 +551,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}
-/*
- * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
+/**
+ * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
+ *
+ * @adev: amdgpu_device pointer
+ * @reg: mmio/rlc register
+ * @v: value to write
*
- * this function is invoked only the debugfs register access
- * */
+ * this function is invoked only for debugfs register access
+ */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
uint32_t reg, uint32_t v)
{
@@ -490,7 +570,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
adev->gfx.rlc.funcs &&
adev->gfx.rlc.funcs->is_rlcg_access_range) {
if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
- return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v, 0);
+ return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
+ } else if ((reg * 4) >= adev->rmmio_size) {
+ adev->pcie_wreg(adev, reg * 4, v);
} else {
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
}
@@ -834,7 +916,10 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
amdgpu_asic_pre_asic_init(adev);
- return amdgpu_atom_asic_init(adev->mode_info.atom_context);
+ if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
+ return amdgpu_atomfirmware_asic_init(adev, true);
+ else
+ return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}
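
The GC_HWIP comparison above works because amdgpu packs IP versions into a single integer, so ordering is a plain numeric check. A sketch of the conventional encoding (this mirrors the driver's IP_VERSION() macro; treat the exact definition as an assumption here):

#define EXAMPLE_IP_VERSION(mj, mn, rv) (((mj) << 16) | ((mn) << 8) | (rv))
/* EXAMPLE_IP_VERSION(11, 0, 0) == 0x0b0000, so every GC 10.x value sorts below it */
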
/**
@@ -962,19 +1047,25 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
adev->doorbell.base = pci_resource_start(adev->pdev, 2);
adev->doorbell.size = pci_resource_len(adev->pdev, 2);
- adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
- adev->doorbell_index.max_assignment+1);
- if (adev->doorbell.num_doorbells == 0)
- return -EINVAL;
-
- /* For Vega, reserve and map two pages on doorbell BAR since SDMA
- * paging queue doorbell use the second page. The
- * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
- * doorbells are in the first page. So with paging queue enabled,
- * the max num_doorbells should + 1 page (0x400 in dword)
- */
- if (adev->asic_type >= CHIP_VEGA10)
- adev->doorbell.num_doorbells += 0x400;
+ if (adev->enable_mes) {
+ adev->doorbell.num_doorbells =
+ adev->doorbell.size / sizeof(u32);
+ } else {
+ adev->doorbell.num_doorbells =
+ min_t(u32, adev->doorbell.size / sizeof(u32),
+ adev->doorbell_index.max_assignment+1);
+ if (adev->doorbell.num_doorbells == 0)
+ return -EINVAL;
+
+ /* For Vega, reserve and map two pages on doorbell BAR since SDMA
+ * paging queue doorbell use the second page. The
+ * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
+ * doorbells are in the first page. So with paging queue enabled,
+ * the max num_doorbells should + 1 page (0x400 in dword)
+ */
+ if (adev->asic_type >= CHIP_VEGA10)
+ adev->doorbell.num_doorbells += 0x400;
+ }
adev->doorbell.ptr = ioremap(adev->doorbell.base,
adev->doorbell.num_doorbells *
@@ -1025,7 +1116,7 @@ static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
}
/**
- * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
+ * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
*
* @adev: amdgpu_device pointer
*
@@ -1234,19 +1325,45 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
return true;
}
+/**
+ * amdgpu_device_should_use_aspm - check if the device should program ASPM
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Confirm whether the module parameter and pcie bridge agree that ASPM should
+ * be set for this device.
+ *
+ * Returns true if it should be used or false if not.
+ */
+bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
+{
+ switch (amdgpu_aspm) {
+ case -1:
+ break;
+ case 0:
+ return false;
+ case 1:
+ return true;
+ default:
+ return false;
+ }
+ return pcie_aspm_enabled(adev->pdev);
+}
+
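
Per the switch above, amdgpu_aspm == 0 or 1 forces the answer, any other explicit value disables ASPM, and only the default (-1) defers to whatever the PCIe bridge negotiated via pcie_aspm_enabled(). A hedged sketch of a per-ASIC consumer; program_aspm_regs() is a stand-in name, not a real function in this file:

void program_aspm_regs(struct amdgpu_device *adev); /* hypothetical per-ASIC hook */

static void example_program_aspm(struct amdgpu_device *adev)
{
	if (!amdgpu_device_should_use_aspm(adev))
		return;

	/* ASIC-specific ASPM register programming would go here */
	program_aspm_regs(adev);
}
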
/* if we get transitioned to only one device, take VGA back */
/**
* amdgpu_device_vga_set_decode - enable/disable vga decode
*
- * @cookie: amdgpu_device pointer
+ * @pdev: PCI device pointer
* @state: enable/disable vga decode
*
* Enable/disable vga decode (all asics).
* Returns VGA resource flags.
*/
-static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
+static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
+ bool state)
{
- struct amdgpu_device *adev = cookie;
+ struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
amdgpu_asic_set_vga_state(adev, state);
if (state)
return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
@@ -1341,6 +1458,43 @@ def_value:
adev->pm.smu_prv_buffer_size = 0;
}
+static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
+{
+ if (!(adev->flags & AMD_IS_APU) ||
+ adev->asic_type < CHIP_RAVEN)
+ return 0;
+
+ switch (adev->asic_type) {
+ case CHIP_RAVEN:
+ if (adev->pdev->device == 0x15dd)
+ adev->apu_flags |= AMD_APU_IS_RAVEN;
+ if (adev->pdev->device == 0x15d8)
+ adev->apu_flags |= AMD_APU_IS_PICASSO;
+ break;
+ case CHIP_RENOIR:
+ if ((adev->pdev->device == 0x1636) ||
+ (adev->pdev->device == 0x164c))
+ adev->apu_flags |= AMD_APU_IS_RENOIR;
+ else
+ adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
+ break;
+ case CHIP_VANGOGH:
+ adev->apu_flags |= AMD_APU_IS_VANGOGH;
+ break;
+ case CHIP_YELLOW_CARP:
+ break;
+ case CHIP_CYAN_SKILLFISH:
+ if ((adev->pdev->device == 0x13FE) ||
+ (adev->pdev->device == 0x143F))
+ adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
/**
* amdgpu_device_check_arguments - validate module params
*
@@ -1392,6 +1546,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
}
+ if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
+ dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
+ amdgpu_reset_method = -1;
+ }
+
amdgpu_device_check_smu_prv_buffer_size(adev);
amdgpu_device_check_vm_size(adev);
@@ -1400,10 +1559,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
- amdgpu_gmc_tmz_set(adev);
-
- amdgpu_gmc_noretry_set(adev);
-
return 0;
}
@@ -1557,7 +1712,7 @@ int amdgpu_device_ip_set_powergating_state(void *dev,
* clockgating is enabled.
*/
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
- u32 *flags)
+ u64 *flags)
{
int i;
@@ -1683,6 +1838,19 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
if (!ip_block_version)
return -EINVAL;
+ switch (ip_block_version->type) {
+ case AMD_IP_BLOCK_TYPE_VCN:
+ if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
+ return 0;
+ break;
+ case AMD_IP_BLOCK_TYPE_JPEG:
+ if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
+ return 0;
+ break;
+ default:
+ break;
+ }
+
DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
ip_block_version->funcs->name);
@@ -1767,11 +1935,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
adev->firmware.gpu_info_fw = NULL;
if (adev->mman.discovery_bin) {
- amdgpu_discovery_get_gfx_info(adev);
-
/*
* FIXME: The bounding box is still needed by Navi12, so
- * temporarily read it from gpu_info firmware. Should be droped
+ * temporarily read it from gpu_info firmware. Should be dropped
* when DAL no longer needs it.
*/
if (adev->asic_type != CHIP_NAVI12)
@@ -1779,34 +1945,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
}
switch (adev->asic_type) {
-#ifdef CONFIG_DRM_AMDGPU_SI
- case CHIP_VERDE:
- case CHIP_TAHITI:
- case CHIP_PITCAIRN:
- case CHIP_OLAND:
- case CHIP_HAINAN:
-#endif
-#ifdef CONFIG_DRM_AMDGPU_CIK
- case CHIP_BONAIRE:
- case CHIP_HAWAII:
- case CHIP_KAVERI:
- case CHIP_KABINI:
- case CHIP_MULLINS:
-#endif
- case CHIP_TOPAZ:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- case CHIP_VEGA20:
- case CHIP_ALDEBARAN:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
default:
return 0;
case CHIP_VEGA10:
@@ -1826,24 +1964,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
case CHIP_ARCTURUS:
chip_name = "arcturus";
break;
- case CHIP_RENOIR:
- if (adev->apu_flags & AMD_APU_IS_RENOIR)
- chip_name = "renoir";
- else
- chip_name = "green_sardine";
- break;
- case CHIP_NAVI10:
- chip_name = "navi10";
- break;
- case CHIP_NAVI14:
- chip_name = "navi14";
- break;
case CHIP_NAVI12:
chip_name = "navi12";
break;
- case CHIP_VANGOGH:
- chip_name = "vangogh";
- break;
}
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
@@ -1941,6 +2064,8 @@ out:
*/
static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
{
+ struct drm_device *dev = adev_to_drm(adev);
+ struct pci_dev *parent;
int i, r;
amdgpu_device_enable_virtual_display(adev);
@@ -1998,41 +2123,23 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if (r)
return r;
break;
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_VEGA20:
- case CHIP_RAVEN:
- case CHIP_ARCTURUS:
- case CHIP_RENOIR:
- case CHIP_ALDEBARAN:
- if (adev->flags & AMD_IS_APU)
- adev->family = AMDGPU_FAMILY_RV;
- else
- adev->family = AMDGPU_FAMILY_AI;
-
- r = soc15_set_ip_blocks(adev);
+ default:
+ r = amdgpu_discovery_set_ip_blocks(adev);
if (r)
return r;
break;
- case CHIP_NAVI10:
- case CHIP_NAVI14:
- case CHIP_NAVI12:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_VANGOGH:
- if (adev->asic_type == CHIP_VANGOGH)
- adev->family = AMDGPU_FAMILY_VGH;
- else
- adev->family = AMDGPU_FAMILY_NV;
+ }
- r = nv_set_ip_blocks(adev);
- if (r)
- return r;
- break;
- default:
- /* FIXME: not supported yet */
- return -EINVAL;
+ if (amdgpu_has_atpx() &&
+ (amdgpu_is_atpx_hybrid() ||
+ amdgpu_has_atpx_dgpu_power_cntl()) &&
+ ((adev->flags & AMD_IS_APU) == 0) &&
+ !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
+ adev->flags |= AMD_IS_PX;
+
+ if (!(adev->flags & AMD_IS_APU)) {
+ parent = pci_upstream_bridge(adev->pdev);
+ adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
}
amdgpu_amdkfd_device_probe(adev);
@@ -2185,6 +2292,49 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
return r;
}
+static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
+{
+ long timeout;
+ int r, i;
+
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = adev->rings[i];
+
+ /* No need to set up the GPU scheduler for rings that don't need it */
+ if (!ring || ring->no_scheduler)
+ continue;
+
+ switch (ring->funcs->type) {
+ case AMDGPU_RING_TYPE_GFX:
+ timeout = adev->gfx_timeout;
+ break;
+ case AMDGPU_RING_TYPE_COMPUTE:
+ timeout = adev->compute_timeout;
+ break;
+ case AMDGPU_RING_TYPE_SDMA:
+ timeout = adev->sdma_timeout;
+ break;
+ default:
+ timeout = adev->video_timeout;
+ break;
+ }
+
+ r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
+ ring->num_hw_submission, amdgpu_job_hang_limit,
+ timeout, adev->reset_domain->wq,
+ ring->sched_score, ring->name,
+ adev->dev);
+ if (r) {
+ DRM_ERROR("Failed to create scheduler on ring %s.\n",
+ ring->name);
+ return r;
+ }
+ }
+
+ return 0;
+}
+
+
/**
* amdgpu_device_ip_init - run init for hardware IPs
*
@@ -2215,8 +2365,20 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
}
adev->ip_blocks[i].status.sw = true;
- /* need to do gmc hw init early so we can allocate gpu mem */
- if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+ /* need to do common hw init early so everything is set up for gmc */
+ r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
+ if (r) {
+ DRM_ERROR("hw_init %d failed %d\n", i, r);
+ goto init_failed;
+ }
+ adev->ip_blocks[i].status.hw = true;
+ } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+ /* need to do gmc hw init early so we can allocate gpu mem */
+ /* Try to reserve bad pages early */
+ if (amdgpu_sriov_vf(adev))
+ amdgpu_virt_exchange_data(adev);
+
r = amdgpu_device_vram_scratch_init(adev);
if (r) {
DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
@@ -2292,8 +2454,32 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r)
goto init_failed;
- if (adev->gmc.xgmi.num_physical_nodes > 1)
- amdgpu_xgmi_add_device(adev);
+ /*
+ * In the case of XGMI, grab an extra reference to the reset domain for this device
+ */
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (amdgpu_xgmi_add_device(adev) == 0) {
+ if (!amdgpu_sriov_vf(adev)) {
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+ if (!hive->reset_domain ||
+ !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+ r = -ENOENT;
+ amdgpu_put_xgmi_hive(hive);
+ goto init_failed;
+ }
+
+ /* Drop the early temporary reset domain we created for device */
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = hive->reset_domain;
+ amdgpu_put_xgmi_hive(hive);
+ }
+ }
+ }
+
+ r = amdgpu_device_init_schedulers(adev);
+ if (r)
+ goto init_failed;
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset)
@@ -2504,6 +2690,12 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
adev->ip_blocks[i].status.late_initialized = true;
}
+ r = amdgpu_ras_late_init(adev);
+ if (r) {
+ DRM_ERROR("amdgpu_ras_late_init failed %d", r);
+ return r;
+ }
+
amdgpu_ras_set_error_query_ready(adev, true);
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
@@ -2515,11 +2707,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (r)
DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
- /* For XGMI + passthrough configuration on arcturus, enable light SBR */
- if (adev->asic_type == CHIP_ARCTURUS &&
- amdgpu_passthrough(adev) &&
- adev->gmc.xgmi.num_physical_nodes > 1)
- smu_set_light_sbr(&adev->smu, true);
+ /* For passthrough configurations on arcturus and aldebaran, enable special SBR handling */
+ if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
+ adev->asic_type == CHIP_ALDEBARAN))
+ amdgpu_dpm_handle_passthrough_sbr(adev, true);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
mutex_lock(&mgpu_info.mutex);
@@ -2559,34 +2750,19 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
}
/**
- * amdgpu_device_ip_fini - run fini for hardware IPs
+ * amdgpu_device_smu_fini_early - smu hw_fini wrapper
*
* @adev: amdgpu_device pointer
*
- * Main teardown pass for hardware IPs. The list of all the hardware
- * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
- * are run. hw_fini tears down the hardware associated with each IP
- * and sw_fini tears down any software state associated with each IP.
- * Returns 0 on success, negative error code on failure.
+ * For ASICs that need to disable SMC first
*/
-static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
+static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
int i, r;
- if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
- amdgpu_virt_release_ras_err_handler_data(adev);
-
- amdgpu_ras_pre_fini(adev);
-
- if (adev->gmc.xgmi.num_physical_nodes > 1)
- amdgpu_xgmi_remove_device(adev);
-
- amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
- amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
-
- amdgpu_amdkfd_device_fini(adev);
+ if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
+ return;
- /* need to disable SMC first */
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.hw)
continue;
@@ -2601,6 +2777,30 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
break;
}
}
+}
+
+static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
+{
+ int i, r;
+
+ for (i = 0; i < adev->num_ip_blocks; i++) {
+ if (!adev->ip_blocks[i].version->funcs->early_fini)
+ continue;
+
+ r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
+ if (r) {
+ DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
+ adev->ip_blocks[i].version->funcs->name, r);
+ }
+ }
+
+ amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
+ amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
+
+ amdgpu_amdkfd_suspend(adev, false);
+
+ /* Workaround for ASICs that need to disable SMC first */
+ amdgpu_device_smu_fini_early(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.hw)
@@ -2616,6 +2816,36 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
}
+ if (amdgpu_sriov_vf(adev)) {
+ if (amdgpu_virt_release_full_gpu(adev, false))
+ DRM_ERROR("failed to release exclusive mode on fini\n");
+ }
+
+ return 0;
+}
+
+/**
+ * amdgpu_device_ip_fini - run fini for hardware IPs
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Main teardown pass for hardware IPs. The list of all the hardware
+ * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
+ * are run. hw_fini tears down the hardware associated with each IP
+ * and sw_fini tears down any software state associated with each IP.
+ * Returns 0 on success, negative error code on failure.
+ */
+static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
+{
+ int i, r;
+
+ if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
+ amdgpu_virt_release_ras_err_handler_data(adev);
+
+ if (adev->gmc.xgmi.num_physical_nodes > 1)
+ amdgpu_xgmi_remove_device(adev);
+
+ amdgpu_amdkfd_device_fini_sw(adev);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.sw)
@@ -2649,10 +2879,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_ras_fini(adev);
- if (amdgpu_sriov_vf(adev))
- if (amdgpu_virt_release_full_gpu(adev, false))
- DRM_ERROR("failed to release exclusive mode on fini\n");
-
return 0;
}
@@ -2677,12 +2903,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
struct amdgpu_device *adev =
container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
- mutex_lock(&adev->gfx.gfx_off_mutex);
- if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
- if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
- adev->gfx.gfx_off_state = true;
- }
- mutex_unlock(&adev->gfx.gfx_off_mutex);
+ WARN_ON_ONCE(adev->gfx.gfx_off_state);
+ WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+ if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
+ adev->gfx.gfx_off_state = true;
}
/**
@@ -2742,7 +2967,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
int i, r;
if (adev->in_s0ix)
- amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
+ amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
if (!adev->ip_blocks[i].status.valid)
@@ -2837,13 +3062,13 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
int i, r;
static enum amd_ip_block_type ip_order[] = {
- AMD_IP_BLOCK_TYPE_GMC,
AMD_IP_BLOCK_TYPE_COMMON,
+ AMD_IP_BLOCK_TYPE_GMC,
AMD_IP_BLOCK_TYPE_PSP,
AMD_IP_BLOCK_TYPE_IH,
};
- for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
+ for (i = 0; i < adev->num_ip_blocks; i++) {
int j;
struct amdgpu_ip_block *block;
@@ -2929,7 +3154,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
- adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
+ adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+ (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
@@ -2997,6 +3223,10 @@ static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
int r;
+ r = amdgpu_amdkfd_resume_iommu(adev);
+ if (r)
+ return r;
+
r = amdgpu_device_ip_resume_phase1(adev);
if (r)
return r;
@@ -3021,7 +3251,7 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
{
if (amdgpu_sriov_vf(adev)) {
if (adev->is_atom_fw) {
- if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
+ if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
} else {
if (amdgpu_atombios_has_gpu_virtualization_table(adev))
@@ -3044,12 +3274,28 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
switch (asic_type) {
+#ifdef CONFIG_DRM_AMDGPU_SI
+ case CHIP_HAINAN:
+#endif
+ case CHIP_TOPAZ:
+ /* chips with no display hardware */
+ return false;
#if defined(CONFIG_DRM_AMD_DC)
-#if defined(CONFIG_DRM_AMD_DC_SI)
case CHIP_TAHITI:
case CHIP_PITCAIRN:
case CHIP_VERDE:
case CHIP_OLAND:
+ /*
+ * We have systems in the wild with these ASICs that require
+ * LVDS and VGA support which is not supported with DC.
+ *
+ * Fallback to the non-DC driver here by default so as not to
+ * cause regressions.
+ */
+#if defined(CONFIG_DRM_AMD_DC_SI)
+ return amdgpu_dc > 0;
+#else
+ return false;
#endif
case CHIP_BONAIRE:
case CHIP_KAVERI:
@@ -3057,42 +3303,21 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
case CHIP_MULLINS:
/*
* We have systems in the wild with these ASICs that require
- * LVDS and VGA support which is not supported with DC.
+ * VGA support which is not supported with DC.
*
* Fallback to the non-DC driver here by default so as not to
* cause regressions.
*/
return amdgpu_dc > 0;
- case CHIP_HAWAII:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_VEGA20:
-#if defined(CONFIG_DRM_AMD_DC_DCN)
- case CHIP_RAVEN:
- case CHIP_NAVI10:
- case CHIP_NAVI14:
- case CHIP_NAVI12:
- case CHIP_RENOIR:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_VANGOGH:
-#endif
+ default:
return amdgpu_dc != 0;
-#endif
+#else
default:
if (amdgpu_dc > 0)
DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
"but isn't supported by ASIC, ignoring\n");
return false;
+#endif
}
}
@@ -3105,13 +3330,14 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
*/
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
- if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
+ if (amdgpu_sriov_vf(adev) ||
+ adev->enable_virtual_display ||
+ (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
return amdgpu_device_asic_has_dc_support(adev->asic_type);
}
-
static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
@@ -3142,9 +3368,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
if (adev->asic_reset_res)
goto fail;
- if (adev->mmhub.ras_funcs &&
- adev->mmhub.ras_funcs->reset_ras_error_count)
- adev->mmhub.ras_funcs->reset_ras_error_count(adev);
+ if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
+ adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
+ adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
} else {
task_barrier_full(&hive->tb);
@@ -3167,8 +3393,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
int ret = 0;
/*
- * By default timeout for non compute jobs is 10000.
- * And there is no timeout enforced on compute jobs.
+ * By default the timeout for non-compute jobs is 10000
+ * and for compute jobs is 60000.
* In SR-IOV or passthrough mode, the timeout for compute
* jobs is 60000 by default.
*/
@@ -3177,10 +3403,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
if (amdgpu_sriov_vf(adev))
adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
- else if (amdgpu_passthrough(adev))
- adev->compute_timeout = msecs_to_jiffies(60000);
else
- adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
+ adev->compute_timeout = msecs_to_jiffies(60000);
if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
while ((timeout_setting = strsep(&input, ",")) &&
@@ -3194,6 +3418,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
continue;
} else if (timeout < 0) {
timeout = MAX_SCHEDULE_TIMEOUT;
+ dev_warn(adev->dev, "lockup timeout disabled");
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
} else {
timeout = msecs_to_jiffies(timeout);
}
@@ -3229,6 +3455,22 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
return ret;
}
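
With the parsing above, lockup_timeout takes a comma-separated list of millisecond values, where 0 keeps the default and a negative value now disables the timeout entirely (and taints the kernel). A typical invocation, assuming the usual amdgpu ordering of gfx, compute, sdma and video timeouts (treat the exact ordering as an assumption, not something this patch defines):

	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
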
+/**
+ * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
+ */
+static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
+{
+ struct iommu_domain *domain;
+
+ domain = iommu_get_domain_for_dev(adev->dev);
+ if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
+ adev->ram_is_direct_mapped = true;
+}
+
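
IOMMU_DOMAIN_IDENTITY is the pass-through case: DMA addresses equal physical addresses, so system RAM is effectively direct-mapped for the device. An illustrative probe of the same condition the helper above keys off (not part of the patch):

static void example_log_iommu_mode(struct amdgpu_device *adev)
{
	struct iommu_domain *domain = iommu_get_domain_for_dev(adev->dev);

	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
		dev_info(adev->dev, "no IOMMU translation, RAM is direct mapped\n");
	else
		dev_info(adev->dev, "IOMMU translation active\n");
}
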
static const struct attribute *amdgpu_dev_attributes[] = {
&dev_attr_product_name.attr,
&dev_attr_product_number.attr,
@@ -3237,7 +3479,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
NULL
};
-
/**
* amdgpu_device_init - initialize the driver
*
@@ -3271,11 +3512,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->gmc.gart_size = 512 * 1024 * 1024;
adev->accel_working = false;
adev->num_rings = 0;
+ RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
adev->mman.buffer_funcs = NULL;
adev->mman.buffer_funcs_ring = NULL;
adev->vm_manager.vm_pte_funcs = NULL;
adev->vm_manager.vm_pte_num_scheds = 0;
adev->gmc.gmc_funcs = NULL;
+ adev->harvest_ip_mask = 0x0;
adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
@@ -3312,10 +3555,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
- atomic_set(&adev->in_gpu_reset, 0);
- init_rwsem(&adev->reset_sem);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
+ mutex_init(&adev->pm.stable_pstate_ctx_lock);
+ mutex_init(&adev->benchmark_mutex);
+
+ amdgpu_device_init_apu_flags(adev);
r = amdgpu_device_check_arguments(adev);
if (r)
@@ -3336,6 +3581,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_LIST_HEAD(&adev->reset_list);
+ INIT_LIST_HEAD(&adev->ras_list);
+
INIT_DELAYED_WORK(&adev->delayed_init_work,
amdgpu_device_delayed_init_work_handler);
INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -3344,6 +3591,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
adev->gfx.gfx_off_req_count = 1;
+ adev->gfx.gfx_off_residency = 0;
+ adev->gfx.gfx_off_entrycount = 0;
adev->pm.ac_power = power_supply_is_system_supplied() > 0;
atomic_set(&adev->throttling_logging_enabled, 1);
@@ -3367,6 +3616,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->rmmio_size = pci_resource_len(adev->pdev, 2);
}
+ for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
+ atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
+
adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
if (adev->rmmio == NULL) {
return -ENOMEM;
@@ -3374,24 +3626,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
- /* enable PCIE atomic ops */
- r = pci_enable_atomic_ops_to_root(adev->pdev,
- PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
- PCI_EXP_DEVCAP2_ATOMIC_COMP64);
- if (r) {
- adev->have_atomics_support = false;
- DRM_INFO("PCIE atomic ops is not supported\n");
- } else {
- adev->have_atomics_support = true;
- }
-
amdgpu_device_get_pcie_info(adev);
if (amdgpu_mcbp)
DRM_INFO("MCBP is enabled\n");
- if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
- adev->enable_mes = true;
+ /*
+ * The reset domain needs to be present early, before the XGMI hive is
+ * discovered (if any) and initialized, so that the reset sem and
+ * in_gpu_reset flag can be used early during init and before calling RREG32.
+ */
+ adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
+ if (!adev->reset_domain)
+ return -ENOMEM;
/* detect hw virtualization here */
amdgpu_detect_virtualization(adev);
@@ -3399,13 +3646,37 @@ int amdgpu_device_init(struct amdgpu_device *adev,
r = amdgpu_device_get_job_timeout_settings(adev);
if (r) {
dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
- goto failed_unmap;
+ return r;
}
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
- goto failed_unmap;
+ return r;
+
+ /* Enable TMZ based on IP_VERSION */
+ amdgpu_gmc_tmz_set(adev);
+
+ amdgpu_gmc_noretry_set(adev);
+ /* Need to get xgmi info early to decide the reset behavior*/
+ if (adev->gmc.xgmi.supported) {
+ r = adev->gfxhub.funcs->get_xgmi_info(adev);
+ if (r)
+ return r;
+ }
+
+ /* enable PCIE atomic ops */
+ if (amdgpu_sriov_vf(adev))
+ adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
+ adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
+ (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+ else
+ adev->have_atomics_support =
+ !pci_enable_atomic_ops_to_root(adev->pdev,
+ PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64);
+ if (!adev->have_atomics_support)
+ dev_info(adev->dev, "PCIE atomic ops is not supported\n");
/* doorbell bar mapping and doorbell index init */
amdgpu_device_doorbell_init(adev);
@@ -3490,9 +3761,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
fence_driver_init:
/* Fence driver */
- r = amdgpu_fence_driver_init(adev);
+ r = amdgpu_fence_driver_sw_init(adev);
if (r) {
- dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
+ dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
goto failed;
}
@@ -3519,6 +3790,8 @@ fence_driver_init:
goto release_ras_con;
}
+ amdgpu_fence_driver_hw_init(adev);
+
dev_info(adev->dev,
"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
adev->gfx.config.max_shader_engines,
@@ -3538,8 +3811,6 @@ fence_driver_init:
/* Get a log2 for easy divisions. */
adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
- amdgpu_fbdev_init(adev);
-
r = amdgpu_pm_sysfs_init(adev);
if (r) {
adev->pm_sysfs_en = false;
@@ -3554,18 +3825,13 @@ fence_driver_init:
} else
adev->ucode_sysfs_en = true;
- if ((amdgpu_testing & 1)) {
- if (adev->accel_working)
- amdgpu_test_moves(adev);
- else
- DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
- }
- if (amdgpu_benchmarking) {
- if (adev->accel_working)
- amdgpu_benchmark(adev, amdgpu_benchmarking);
- else
- DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
- }
+ r = amdgpu_psp_sysfs_init(adev);
+ if (r) {
+ adev->psp_sysfs_en = false;
+ if (!amdgpu_sriov_vf(adev))
+ DRM_ERROR("Creating psp sysfs failed\n");
+ } else
+ adev->psp_sysfs_en = true;
/*
* Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
@@ -3610,7 +3876,7 @@ fence_driver_init:
/* this will fail for cards that aren't VGA class devices, just
* ignore it */
if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
- vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
+ vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
if (amdgpu_device_supports_px(ddev)) {
px = true;
@@ -3623,6 +3889,8 @@ fence_driver_init:
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
msecs_to_jiffies(AMDGPU_RESUME_MS));
+ amdgpu_device_check_iommu_direct_map(adev);
+
return 0;
release_ras_con:
@@ -3631,30 +3899,45 @@ release_ras_con:
failed:
amdgpu_vf_error_trans_all(adev);
-failed_unmap:
+ return r;
+}
+
+static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
+{
+
+ /* Clear all CPU mappings pointing to this device */
+ unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+
+ /* Unmap all mapped bars - Doorbell, registers and VRAM */
+ amdgpu_device_doorbell_fini(adev);
+
iounmap(adev->rmmio);
adev->rmmio = NULL;
+ if (adev->mman.aper_base_kaddr)
+ iounmap(adev->mman.aper_base_kaddr);
+ adev->mman.aper_base_kaddr = NULL;
- return r;
+ /* Memory manager related */
+ if (!adev->gmc.xgmi.connected_to_cpu) {
+ arch_phys_wc_del(adev->gmc.vram_mtrr);
+ arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
+ }
}
/**
- * amdgpu_device_fini - tear down the driver
+ * amdgpu_device_fini_hw - tear down the driver
*
* @adev: amdgpu_device pointer
*
* Tear down the driver info (all asics).
* Called at driver shutdown.
*/
-void amdgpu_device_fini(struct amdgpu_device *adev)
+void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(&adev->delayed_init_work);
- ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
adev->shutdown = true;
- kfree(adev->pci_state);
-
/* make sure IB test finished before entering exclusive mode
* to avoid preemption on IB test
* */
@@ -3666,19 +3949,52 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
/* disable all interrupts */
amdgpu_irq_disable_all(adev);
if (adev->mode_info.mode_config_initialized){
- if (!amdgpu_device_has_dc_support(adev))
+ if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
drm_helper_force_disable_all(adev_to_drm(adev));
else
drm_atomic_helper_shutdown(adev_to_drm(adev));
}
- amdgpu_fence_driver_fini(adev);
+ amdgpu_fence_driver_hw_fini(adev);
+
+ if (adev->mman.initialized) {
+ flush_delayed_work(&adev->mman.bdev.wq);
+ ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+ }
+
if (adev->pm_sysfs_en)
amdgpu_pm_sysfs_fini(adev);
- amdgpu_fbdev_fini(adev);
+ if (adev->ucode_sysfs_en)
+ amdgpu_ucode_sysfs_fini(adev);
+ if (adev->psp_sysfs_en)
+ amdgpu_psp_sysfs_fini(adev);
+ sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
+
+ /* disable ras feature must before hw fini */
+ amdgpu_ras_pre_fini(adev);
+
+ amdgpu_device_ip_fini_early(adev);
+
+ amdgpu_irq_fini_hw(adev);
+
+ if (adev->mman.initialized)
+ ttm_device_clear_dma_mappings(&adev->mman.bdev);
+
+ amdgpu_gart_dummy_page_fini(adev);
+
+ amdgpu_device_unmap_mmio(adev);
+
+}
+
+void amdgpu_device_fini_sw(struct amdgpu_device *adev)
+{
+ int idx;
+
+ amdgpu_fence_driver_sw_fini(adev);
amdgpu_device_ip_fini(adev);
release_firmware(adev->firmware.gpu_info_fw);
adev->firmware.gpu_info_fw = NULL;
adev->accel_working = false;
+ dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
amdgpu_reset_fini(adev);
@@ -3696,21 +4012,47 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
vga_switcheroo_fini_domain_pm_ops(adev->dev);
}
if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
- vga_client_register(adev->pdev, NULL, NULL, NULL);
- iounmap(adev->rmmio);
- adev->rmmio = NULL;
- amdgpu_device_doorbell_fini(adev);
+ vga_client_unregister(adev->pdev);
- if (adev->ucode_sysfs_en)
- amdgpu_ucode_sysfs_fini(adev);
+ if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+
+ iounmap(adev->rmmio);
+ adev->rmmio = NULL;
+ amdgpu_device_doorbell_fini(adev);
+ drm_dev_exit(idx);
+ }
- sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
amdgpu_pmu_fini(adev);
if (adev->mman.discovery_bin)
amdgpu_discovery_fini(adev);
+
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = NULL;
+
+ kfree(adev->pci_state);
+
}
+/**
+ * amdgpu_device_evict_resources - evict device resources
+ * @adev: amdgpu device object
+ *
+ * Evicts all ttm device resources (vram BOs, gart table) from the lru list
+ * of the vram memory type. Mainly used for evicting device resources
+ * at suspend time.
+ *
+ */
+static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
+{
+ /* No need to evict vram on APUs for suspend to ram or s2idle */
+ if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
+ return;
+
+ if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
+ DRM_WARN("evicting device resources failed\n");
+
+}
/*
* Suspend & resume.
@@ -3728,37 +4070,45 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
{
struct amdgpu_device *adev = drm_to_adev(dev);
- int r;
+ int r = 0;
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
return 0;
adev->in_suspend = true;
+
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_fini_data_exchange(adev);
+ r = amdgpu_virt_request_full_gpu(adev, false);
+ if (r)
+ return r;
+ }
+
+ if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
+ DRM_WARN("smart shift update failed\n");
+
drm_kms_helper_poll_disable(dev);
if (fbcon)
- amdgpu_fbdev_set_suspend(adev, 1);
+ drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
cancel_delayed_work_sync(&adev->delayed_init_work);
amdgpu_ras_suspend(adev);
- r = amdgpu_device_ip_suspend_phase1(adev);
+ amdgpu_device_ip_suspend_phase1(adev);
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
- /* evict vram memory */
- amdgpu_bo_evict_vram(adev);
+ amdgpu_device_evict_resources(adev);
- amdgpu_fence_driver_suspend(adev);
+ amdgpu_fence_driver_hw_fini(adev);
- r = amdgpu_device_ip_suspend_phase2(adev);
- /* evict remaining vram memory
- * This second call to evict vram is to evict the gart page table
- * using the CPU.
- */
- amdgpu_bo_evict_vram(adev);
+ amdgpu_device_ip_suspend_phase2(adev);
+
+ if (amdgpu_sriov_vf(adev))
+ amdgpu_virt_release_full_gpu(adev, false);
return 0;
}
@@ -3778,11 +4128,17 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
struct amdgpu_device *adev = drm_to_adev(dev);
int r = 0;
+ if (amdgpu_sriov_vf(adev)) {
+ r = amdgpu_virt_request_full_gpu(adev, true);
+ if (r)
+ return r;
+ }
+
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
return 0;
if (adev->in_s0ix)
- amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
+ amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
/* post card */
if (amdgpu_device_need_post(adev)) {
@@ -3792,12 +4148,18 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
}
r = amdgpu_device_ip_resume(adev);
+
+ /* no matter what r is, always need to properly release full GPU */
+ if (amdgpu_sriov_vf(adev)) {
+ amdgpu_virt_init_data_exchange(adev);
+ amdgpu_virt_release_full_gpu(adev, true);
+ }
+
if (r) {
dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
return r;
}
- amdgpu_fence_driver_resume(adev);
-
+ amdgpu_fence_driver_hw_init(adev);
r = amdgpu_device_ip_late_init(adev);
if (r)
@@ -3816,7 +4178,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
flush_delayed_work(&adev->delayed_init_work);
if (fbcon)
- amdgpu_fbdev_set_suspend(adev, 0);
+ drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
drm_kms_helper_poll_enable(dev);
@@ -3843,6 +4205,9 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
#endif
adev->in_suspend = false;
+ if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
+ DRM_WARN("smart shift update failed\n");
+
return 0;
}
@@ -4016,6 +4381,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
struct dma_fence *fence = NULL, *next = NULL;
struct amdgpu_bo *shadow;
+ struct amdgpu_bo_vm *vmbo;
long r = 1, tmo;
if (amdgpu_sriov_runtime(adev))
@@ -4025,12 +4391,12 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
dev_info(adev->dev, "recover vram bo from shadow start\n");
mutex_lock(&adev->shadow_list_lock);
- list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
-
+ list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
+ shadow = &vmbo->bo;
/* No need to recover an evicted BO */
- if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
- shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
- shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
+ if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
+ shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
+ shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
continue;
r = amdgpu_bo_restore_shadow(shadow, &next);
@@ -4081,6 +4447,11 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
bool from_hypervisor)
{
int r;
+ struct amdgpu_hive_info *hive = NULL;
+ int retry_limit = 0;
+
+retry:
+ amdgpu_amdkfd_pre_reset(adev);
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
@@ -4089,16 +4460,12 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
if (r)
return r;
- amdgpu_amdkfd_pre_reset(adev);
-
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
goto error;
amdgpu_virt_init_data_exchange(adev);
- /* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4109,9 +4476,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
if (r)
goto error;
- amdgpu_irq_gpu_reset_resume_helper(adev);
- r = amdgpu_ib_ring_tests(adev);
- amdgpu_amdkfd_post_reset(adev);
+ hive = amdgpu_get_xgmi_hive(adev);
+ /* Update PSP FW topology after reset */
+ if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
+ r = amdgpu_xgmi_update_topology(hive, adev);
+
+ if (hive)
+ amdgpu_put_xgmi_hive(hive);
+
+ if (!r) {
+ amdgpu_irq_gpu_reset_resume_helper(adev);
+ r = amdgpu_ib_ring_tests(adev);
+
+ amdgpu_amdkfd_post_reset(adev);
+ }
error:
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
@@ -4120,6 +4498,14 @@ error:
}
amdgpu_virt_release_full_gpu(adev, true);
+ if (AMDGPU_RETRY_SRIOV_RESET(r)) {
+ if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
+ retry_limit++;
+ goto retry;
+ } else
+ DRM_ERROR("GPU reset retry is beyond the retry limit\n");
+ }
+
return r;
}
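
The retry plumbing added here re-enters the whole sequence at the retry: label, bounded by AMDGPU_MAX_RETRY_LIMIT, and only for the errno values that the AMDGPU_RETRY_SRIOV_RESET() macro at the top of this patch classifies as transient. An equivalent, illustrative open-coded form of that gate (try_sriov_reset() is a hypothetical wrapper for the body above):

int retry_limit = 0;
int r = try_sriov_reset(adev);

/* -EBUSY, -ETIMEDOUT and -EINVAL are retried up to the limit */
while (AMDGPU_RETRY_SRIOV_RESET(r) && retry_limit++ < AMDGPU_MAX_RETRY_LIMIT)
	r = try_sriov_reset(adev);
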
@@ -4161,45 +4547,38 @@ bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
*/
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
{
- if (!amdgpu_device_ip_check_soft_reset(adev)) {
- dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
- return false;
- }
if (amdgpu_gpu_recovery == 0)
goto disabled;
+ if (!amdgpu_device_ip_check_soft_reset(adev)) {
+ dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
+ return false;
+ }
+
if (amdgpu_sriov_vf(adev))
return true;
if (amdgpu_gpu_recovery == -1) {
switch (adev->asic_type) {
- case CHIP_BONAIRE:
- case CHIP_HAWAII:
- case CHIP_TOPAZ:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_VEGA20:
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_RAVEN:
- case CHIP_ARCTURUS:
- case CHIP_RENOIR:
- case CHIP_NAVI10:
- case CHIP_NAVI14:
- case CHIP_NAVI12:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_VANGOGH:
- case CHIP_ALDEBARAN:
- break;
- default:
+#ifdef CONFIG_DRM_AMDGPU_SI
+ case CHIP_VERDE:
+ case CHIP_TAHITI:
+ case CHIP_PITCAIRN:
+ case CHIP_OLAND:
+ case CHIP_HAINAN:
+#endif
+#ifdef CONFIG_DRM_AMDGPU_CIK
+ case CHIP_KAVERI:
+ case CHIP_KABINI:
+ case CHIP_MULLINS:
+#endif
+ case CHIP_CARRIZO:
+ case CHIP_STONEY:
+ case CHIP_CYAN_SKILLFISH:
goto disabled;
+ default:
+ break;
}
}
@@ -4261,15 +4640,13 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (reset_context->reset_req_dev == adev)
job = reset_context->job;
- /* no need to dump if device is not in good state during probe period */
- if (!adev->gmc.xgmi.pending_reset)
- amdgpu_debugfs_wait_dump(adev);
-
if (amdgpu_sriov_vf(adev)) {
/* stop the data exchange thread */
amdgpu_virt_fini_data_exchange(adev);
}
+ amdgpu_fence_driver_isr_toggle(adev, true);
+
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -4277,11 +4654,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!ring || !ring->sched.thread)
continue;
+ /* Clear the job fences from the fence drv to avoid force_completion
+ * on them; leave only the NULL and vm flush fences in the fence drv. */
+ amdgpu_fence_driver_clear_job_fences(ring);
+
/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
amdgpu_fence_driver_force_completion(ring);
}
- if(job)
+ amdgpu_fence_driver_isr_toggle(adev, false);
+
+ if (job && job->vm)
drm_sched_increase_karma(&job->base);
r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
@@ -4297,7 +4680,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
if (!need_full_reset)
need_full_reset = amdgpu_device_ip_need_full_reset(adev);
- if (!need_full_reset) {
+ if (!need_full_reset && amdgpu_gpu_recovery) {
amdgpu_device_ip_pre_soft_reset(adev);
r = amdgpu_device_ip_soft_reset(adev);
amdgpu_device_ip_post_soft_reset(adev);
@@ -4319,16 +4702,88 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
+static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
+{
+ int i;
+
+ lockdep_assert_held(&adev->reset_domain->sem);
+
+ for (i = 0; i < adev->num_regs; i++) {
+ adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+ trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
+ adev->reset_dump_reg_value[i]);
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
+ size_t count, void *data, size_t datalen)
+{
+ struct drm_printer p;
+ struct amdgpu_device *adev = data;
+ struct drm_print_iterator iter;
+ int i;
+
+ iter.data = buffer;
+ iter.offset = 0;
+ iter.start = offset;
+ iter.remain = count;
+
+ p = drm_coredump_printer(&iter);
+
+ drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+ drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+ drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+ drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
+ if (adev->reset_task_info.pid)
+ drm_printf(&p, "process_name: %s PID: %d\n",
+ adev->reset_task_info.process_name,
+ adev->reset_task_info.pid);
+
+ if (adev->reset_vram_lost)
+ drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+ if (adev->num_regs) {
+ drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
+
+ for (i = 0; i < adev->num_regs; i++)
+ drm_printf(&p, "0x%08x: 0x%08x\n",
+ adev->reset_dump_reg_list[i],
+ adev->reset_dump_reg_value[i]);
+ }
+
+ return count - iter.remain;
+}
+
+static void amdgpu_devcoredump_free(void *data)
+{
+}
+
+static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+{
+ struct drm_device *dev = adev_to_drm(adev);
+
+ ktime_get_ts64(&adev->reset_time);
+ dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
+ amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+}
+#endif
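
Once dev_coredumpm() has registered the dump, the generic devcoredump core exposes it to userspace; conventionally it appears as a data node under /sys/class/devcoredump (the exact path, device numbering, and write-to-free behavior are assumptions about the devcoredump facility, not something this patch defines). A minimal userspace reader sketch:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	/* devcd device number varies per dump -- assumed path */
	int fd = open("/sys/class/devcoredump/devcd1/data", O_RDONLY);

	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);   /* dump the coredump to stdout */
	close(fd);
	return 0;
}
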
+
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
+ bool gpu_reset_for_dev_remove = false;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
+ amdgpu_reset_reg_dumps(tmp_adev);
+
+ reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
if (r == -ENOSYS)
@@ -4341,6 +4796,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
+
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
@@ -4377,14 +4836,26 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- if (tmp_adev->mmhub.ras_funcs &&
- tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
- tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
+ if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
+ tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
+ tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
}
amdgpu_ras_intr_cleared();
}
+ /* Since the mode1 reset affects base ip blocks, the
+ * phase1 ip blocks need to be resumed. Otherwise there
+ * will be a BIOS signature error and the psp bootloader
+ * can't load kdb on the next amdgpu install.
+ */
+ if (gpu_reset_for_dev_remove) {
+ list_for_each_entry(tmp_adev, device_list_handle, reset_list)
+ amdgpu_device_ip_resume_phase1(tmp_adev);
+
+ goto end;
+ }
+
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
@@ -4393,20 +4864,29 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
dev_warn(tmp_adev->dev, "asic atom init failed!");
} else {
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
+ r = amdgpu_amdkfd_resume_iommu(tmp_adev);
+ if (r)
+ goto out;
+
r = amdgpu_device_ip_resume_phase1(tmp_adev);
if (r)
goto out;
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+#ifdef CONFIG_DEV_COREDUMP
+ tmp_adev->reset_vram_lost = vram_lost;
+ memset(&tmp_adev->reset_task_info, 0,
+ sizeof(tmp_adev->reset_task_info));
+ if (reset_context->job && reset_context->job->vm)
+ tmp_adev->reset_task_info =
+ reset_context->job->vm->task_info;
+ amdgpu_reset_capture_coredumpm(tmp_adev);
+#endif
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
}
- r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
- if (r)
- goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
@@ -4432,7 +4912,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
if (r)
goto out;
- amdgpu_fbdev_set_suspend(tmp_adev, 0);
+ drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
/*
* The GPU enters bad state once faulty pages
@@ -4466,7 +4946,6 @@ out:
r = amdgpu_ib_ring_tests(tmp_adev);
if (r) {
dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
- r = amdgpu_device_ip_suspend(tmp_adev);
need_full_reset = true;
r = -EAGAIN;
goto end;
@@ -4487,17 +4966,8 @@ end:
return r;
}
-static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive)
+static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
{
- if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
- return false;
-
- if (hive) {
- down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
- } else {
- down_write(&adev->reset_sem);
- }
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
@@ -4510,56 +4980,12 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
adev->mp1_state = PP_MP1_STATE_NONE;
break;
}
-
- return true;
}
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
- atomic_set(&adev->in_gpu_reset, 0);
- up_write(&adev->reset_sem);
-}
-
-/*
- * to lockup a list of amdgpu devices in a hive safely, if not a hive
- * with multiple nodes, it will be similar as amdgpu_device_lock_adev.
- *
- * unlock won't require roll back.
- */
-static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
-{
- struct amdgpu_device *tmp_adev = NULL;
-
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!hive) {
- dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
- return -ENODEV;
- }
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- if (!amdgpu_device_lock_adev(tmp_adev, hive))
- goto roll_back;
- }
- } else if (!amdgpu_device_lock_adev(adev, hive))
- return -EAGAIN;
-
- return 0;
-roll_back:
- if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
- /*
- * if the lockup iteration break in the middle of a hive,
- * it may means there may has a race issue,
- * or a hive device locked up independently.
- * we may be in trouble and may not, so will try to roll back
- * the lock and give out a warnning.
- */
- dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
- list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
- amdgpu_device_unlock_adev(tmp_adev);
- }
- }
- return -EAGAIN;
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
@@ -4620,7 +5046,7 @@ static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
return 0;
}
-void amdgpu_device_recheck_guilty_jobs(
+static void amdgpu_device_recheck_guilty_jobs(
struct amdgpu_device *adev, struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
{
@@ -4643,13 +5069,33 @@ void amdgpu_device_recheck_guilty_jobs(
drm_sched_reset_karma(s_job);
drm_sched_resubmit_jobs_ext(&ring->sched, 1);
+ if (!s_job->s_fence->parent) {
+ DRM_WARN("Failed to get a HW fence for job!");
+ continue;
+ }
+
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
if (ret == 0) { /* timeout */
DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
ring->sched.name, s_job->id);
+
+ amdgpu_fence_driver_isr_toggle(adev, true);
+
+ /* Clear this failed job from fence array */
+ amdgpu_fence_driver_clear_job_fences(ring);
+
+ amdgpu_fence_driver_isr_toggle(adev, false);
+
+			/* Since the job won't signal and we go for
+			 * another resubmit, drop this parent pointer
+			 */
+ dma_fence_put(s_job->s_fence->parent);
+ s_job->s_fence->parent = NULL;
+
/* set guilty */
drm_sched_increase_karma(s_job);
+ amdgpu_reset_prepare_hwcontext(adev, reset_context);
retry:
/* do hw reset */
if (amdgpu_sriov_vf(adev)) {
@@ -4688,6 +5134,27 @@ retry:
}
}
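+
+/* Cancel pending resets queued from other sources (debugfs trigger, KFD,
+ * SRIOV FLR, RAS recovery) so they don't race with the reset in progress.
+ */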
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+#if defined(CONFIG_DEBUG_FS)
+ if (!amdgpu_sriov_vf(adev))
+ cancel_work(&adev->reset_work);
+#endif
+
+ if (adev->kfd.dev)
+ cancel_work(&adev->kfd.reset_work);
+
+ if (amdgpu_sriov_vf(adev))
+ cancel_work(&adev->virt.flr_work);
+
+ if (con && adev->ras_enabled)
+ cancel_work(&con->recovery_work);
+}
+
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -4700,7 +5167,8 @@ retry:
*/
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
- struct amdgpu_job *job)
+ struct amdgpu_job *job,
+ struct amdgpu_reset_context *reset_context)
{
struct list_head device_list, *device_list_handle = NULL;
bool job_signaled = false;
@@ -4710,9 +5178,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool need_emergency_restart = false;
bool audio_suspended = false;
int tmp_vram_lost_counter;
- struct amdgpu_reset_context reset_context;
+ bool gpu_reset_for_dev_remove = false;
- memset(&reset_context, 0, sizeof(reset_context));
+ gpu_reset_for_dev_remove =
+ test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
+ test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* Special case: RAS triggered and full reset isn't supported
@@ -4733,47 +5203,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
dev_info(adev->dev, "GPU %s begin!\n",
need_emergency_restart ? "jobs stop":"reset");
- /*
- * Here we trylock to avoid chain of resets executing from
- * either trigger by jobs on different adevs in XGMI hive or jobs on
- * different schedulers for same device while this TO handler is running.
- * We always reset all schedulers for device and all devices for XGMI
- * hive so that should take care of them too.
- */
- hive = amdgpu_get_xgmi_hive(adev);
- if (hive) {
- if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
- DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
- job ? job->base.id : -1, hive->hive_id);
- amdgpu_put_xgmi_hive(hive);
- if (job)
- drm_sched_increase_karma(&job->base);
- return 0;
- }
+ if (!amdgpu_sriov_vf(adev))
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive)
mutex_lock(&hive->hive_lock);
- }
-
- reset_context.method = AMD_RESET_METHOD_NONE;
- reset_context.reset_req_dev = adev;
- reset_context.job = job;
- reset_context.hive = hive;
- clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-
- /*
- * lock the device before we try to operate the linked list
- * if didn't get the device lock, don't touch the linked list since
- * others may iterating it.
- */
- r = amdgpu_device_lock_hive_adev(adev, hive);
- if (r) {
- dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
- /* even we skipped this reset, still need to set the job to guilty */
- if (job)
- drm_sched_increase_karma(&job->base);
- goto skip_recovery;
- }
+ reset_context->job = job;
+ reset_context->hive = hive;
/*
* Build list of devices to reset.
@@ -4781,9 +5217,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* to put adev in the 1st position.
*/
INIT_LIST_HEAD(&device_list);
- if (adev->gmc.xgmi.num_physical_nodes > 1) {
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (gpu_reset_for_dev_remove && adev->shutdown)
+ tmp_adev->shutdown = true;
+ }
if (!list_is_first(&adev->reset_list, &device_list))
list_rotate_to_front(&adev->reset_list, &device_list);
device_list_handle = &device_list;
@@ -4792,8 +5231,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
device_list_handle = &device_list;
}
+ /* We need to lock reset domain only once both for XGMI and single device */
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+
+ amdgpu_device_set_mp1_state(tmp_adev);
+
/*
* Try to put the audio codec into suspend state
* before gpu reset started.
@@ -4820,7 +5267,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/
amdgpu_unregister_gpu_instance(tmp_adev);
- amdgpu_fbdev_set_suspend(tmp_adev, 1);
+ drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
/* disable ras on ALL IPs */
if (!need_emergency_restart &&
@@ -4850,8 +5297,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*
* job->base holds a reference to parent fence
*/
- if (job && job->base.s_fence->parent &&
- dma_fence_is_signaled(job->base.s_fence->parent)) {
+ if (job && dma_fence_is_signaled(&job->hw_fence)) {
job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
@@ -4859,26 +5305,46 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
- r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
+ if (gpu_reset_for_dev_remove) {
+			/* Workaround for ASICs that need to disable SMC first */
+ amdgpu_device_smu_fini_early(tmp_adev);
+ }
+ r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
r, adev_to_drm(tmp_adev)->unique);
tmp_adev->asic_reset_res = r;
}
+
+		/*
+		 * Drop all pending non-scheduler resets. Scheduler resets
+		 * were already dropped during drm_sched_stop.
+		 */
+ amdgpu_device_stop_pending_resets(tmp_adev);
}
tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
/* Actual ASIC resets if needed.*/
- /* TODO Implement XGMI hive reset logic for SRIOV */
+ /* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true);
if (r)
adev->asic_reset_res = r;
+
+		/* Aldebaran supports RAS in SRIOV, so RAS needs to be resumed during reset */
+ if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+ amdgpu_ras_resume(adev);
} else {
- r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
- if (r && r == -EAGAIN)
+ r = amdgpu_do_asic_reset(device_list_handle, reset_context);
+ if (r && r == -EAGAIN) {
+ set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context->flags);
+ adev->asic_reset_res = 0;
goto retry;
+ }
+
+ if (!r && gpu_reset_for_dev_remove)
+ goto recover_end;
}
skip_hw_reset:
@@ -4896,7 +5362,7 @@ skip_hw_reset:
if (amdgpu_gpu_recovery == 2 &&
!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
amdgpu_device_recheck_guilty_jobs(
- tmp_adev, device_list_handle, &reset_context);
+ tmp_adev, device_list_handle, reset_context);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
@@ -4911,10 +5377,16 @@ skip_hw_reset:
drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
}
- if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
+		if (tmp_adev->enable_mes)
+ amdgpu_mes_self_test(tmp_adev);
+
+ if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
}
+ if (tmp_adev->asic_reset_res)
+ r = tmp_adev->asic_reset_res;
+
tmp_adev->asic_reset_res = 0;
if (r) {
@@ -4923,6 +5395,8 @@ skip_hw_reset:
amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
} else {
dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
+ if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
+ DRM_WARN("smart shift update failed\n");
}
}
@@ -4930,7 +5404,7 @@ skip_sched_resume:
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* unlock kfd: SRIOV would do it separately */
if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
- amdgpu_amdkfd_post_reset(tmp_adev);
+ amdgpu_amdkfd_post_reset(tmp_adev);
/* kfd_post_reset will do nothing if kfd device is not initialized,
* need to bring up kfd here if it's not be initialized before
@@ -4940,18 +5414,24 @@ skip_sched_resume:
if (audio_suspended)
amdgpu_device_resume_display_audio(tmp_adev);
- amdgpu_device_unlock_adev(tmp_adev);
+
+ amdgpu_device_unset_mp1_state(tmp_adev);
}
-skip_recovery:
+recover_end:
+ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
+ reset_list);
+ amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+
if (hive) {
- atomic_set(&hive->in_reset, 0);
mutex_unlock(&hive->hive_lock);
amdgpu_put_xgmi_hive(hive);
}
- if (r && r != -EAGAIN)
+ if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
+
+ atomic_set(&adev->reset_domain->reset_res, r);
return r;
}
@@ -5103,6 +5583,37 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
}
}
+/**
+ * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
+ *
+ * @adev: amdgpu_device pointer
+ * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
+ *
+ * Return true if @peer_adev can access (DMA) @adev through the PCIe
+ * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
+ * @peer_adev.
+ */
+bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
+ struct amdgpu_device *peer_adev)
+{
+#ifdef CONFIG_HSA_AMD_P2P
+ uint64_t address_mask = peer_adev->dev->dma_mask ?
+ ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
+ resource_size_t aper_limit =
+ adev->gmc.aper_base + adev->gmc.aper_size - 1;
+ bool p2p_access =
+ !adev->gmc.xgmi.connected_to_cpu &&
+ !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
+
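+	/* All of VRAM must be CPU-visible (large BAR) and the whole aperture
+	 * must fall within the peer's DMA mask.
+	 */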
+ return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
+ adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
+ !(adev->gmc.aper_base & address_mask ||
+ aper_limit & address_mask));
+#else
+ return false;
+#endif
+}
+
int amdgpu_device_baco_enter(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);
@@ -5111,7 +5622,8 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
return -ENOTSUPP;
- if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+ if (ras && adev->ras_enabled &&
+ adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
return amdgpu_dpm_baco_enter(adev);
@@ -5130,24 +5642,15 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
if (ret)
return ret;
- if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
+ if (ras && adev->ras_enabled &&
+ adev->nbio.funcs->enable_doorbell_interrupt)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
- return 0;
-}
-
-static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
-{
- int i;
-
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = adev->rings[i];
-
- if (!ring || !ring->sched.thread)
- continue;
+ if (amdgpu_passthrough(adev) &&
+ adev->nbio.funcs->clear_doorbell_interrupt)
+ adev->nbio.funcs->clear_doorbell_interrupt(adev);
- cancel_delayed_work_sync(&ring->sched.work_tdr);
- }
+ return 0;
}
/**
@@ -5172,20 +5675,19 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
return PCI_ERS_RESULT_DISCONNECT;
}
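+	/* Remember the channel state; amdgpu_pci_resume() only needs to act
+	 * on the pci_channel_io_frozen case.
+	 */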
+ adev->pci_channel_state = state;
+
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:
/*
- * Cancel and wait for all TDRs in progress if failing to
- * set adev->in_gpu_reset in amdgpu_device_lock_adev
- *
- * Locking adev->reset_sem will prevent any external access
+ * Locking adev->reset_domain->sem will prevent any external access
* to GPU during PCI error recovery
*/
- while (!amdgpu_device_lock_adev(adev, NULL))
- amdgpu_cancel_all_tdr(adev);
+ amdgpu_device_lock_reset_domain(adev->reset_domain);
+ amdgpu_device_set_mp1_state(adev);
/*
* Block any work scheduling as we do for regular GPU reset
@@ -5275,10 +5777,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
+ set_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
- adev->in_pci_err_recovery = true;
+ adev->no_hw_access = true;
r = amdgpu_device_pre_asic_reset(adev, &reset_context);
- adev->in_pci_err_recovery = false;
+ adev->no_hw_access = false;
if (r)
goto out;
@@ -5292,7 +5795,8 @@ out:
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
- amdgpu_device_unlock_adev(adev);
+ amdgpu_device_unset_mp1_state(adev);
+ amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
@@ -5314,6 +5818,10 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
DRM_INFO("PCI error: resume callback!!\n");
+ /* Only continue execution for the case of pci_channel_io_frozen */
+ if (adev->pci_channel_state != pci_channel_io_frozen)
+ return;
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -5325,7 +5833,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
drm_sched_start(&ring->sched, true);
}
- amdgpu_device_unlock_adev(adev);
+ amdgpu_device_unset_mp1_state(adev);
+ amdgpu_device_unlock_reset_domain(adev->reset_domain);
}
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
@@ -5373,4 +5882,141 @@ bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
return true;
}
+void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
+{
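+	/* An HDP flush is only needed when the host accesses VRAM through the
+	 * PCIe BAR; bare-metal APUs and GPUs XGMI-connected to the CPU are
+	 * coherent and can skip it.
+	 */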
+#ifdef CONFIG_X86_64
+ if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
+ return;
+#endif
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return;
+
+ if (ring && ring->funcs->emit_hdp_flush)
+ amdgpu_ring_emit_hdp_flush(ring);
+ else
+ amdgpu_asic_flush_hdp(adev, ring);
+}
+
+void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
+{
+#ifdef CONFIG_X86_64
+ if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
+ return;
+#endif
+ if (adev->gmc.xgmi.connected_to_cpu)
+ return;
+
+ amdgpu_asic_invalidate_hdp(adev, ring);
+}
+
+int amdgpu_in_reset(struct amdgpu_device *adev)
+{
+ return atomic_read(&adev->reset_domain->in_gpu_reset);
+}
+
+/**
+ * amdgpu_device_halt() - bring hardware to some kind of halt state
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Bring the hardware to a halt state so that no one can touch it any
+ * more. This helps to preserve the error context when an error occurs.
+ * Compared to a simple hang, the system stays stable, at least for SSH
+ * access, so it should be trivial to inspect the hardware state and see
+ * what's going on. Implemented as follows:
+ *
+ * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs,
+ *    etc.), clears all CPU mappings to the device and disallows remapping
+ *    through page faults
+ * 2. amdgpu_irq_disable_all() disables all interrupts
+ * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
+ * 4. set adev->no_hw_access to avoid potential crashes after step 5
+ * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
+ * 6. pci_disable_device() and pci_wait_for_pending_transaction()
+ *    flush any in-flight DMA operations
+ */
+void amdgpu_device_halt(struct amdgpu_device *adev)
+{
+ struct pci_dev *pdev = adev->pdev;
+ struct drm_device *ddev = adev_to_drm(adev);
+
+ drm_dev_unplug(ddev);
+
+ amdgpu_irq_disable_all(adev);
+
+ amdgpu_fence_driver_hw_fini(adev);
+
+ adev->no_hw_access = true;
+
+ amdgpu_device_unmap_mmio(adev);
+
+ pci_disable_device(pdev);
+ pci_wait_for_pending_transaction(pdev);
+}
+
+u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
+ u32 reg)
+{
+ unsigned long flags, address, data;
+ u32 r;
+
+ address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
+ data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
+
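+	/* Indirect access: select the port register via the index register,
+	 * read it back to flush the posted write, then read the value through
+	 * the data register.
+	 */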
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+ WREG32(address, reg * 4);
+ (void)RREG32(address);
+ r = RREG32(data);
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+ return r;
+}
+
+void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
+ u32 reg, u32 v)
+{
+ unsigned long flags, address, data;
+
+ address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
+ data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
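+
+	/* Same index/data sequence as the read path; the trailing read of the
+	 * data register flushes the posted write.
+	 */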
+ spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+ WREG32(address, reg * 4);
+ (void)RREG32(address);
+ WREG32(data, v);
+ (void)RREG32(data);
+ spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
+}
+
+/**
+ * amdgpu_device_switch_gang - switch to a new gang
+ * @adev: amdgpu_device pointer
+ * @gang: the gang to switch to
+ *
+ * Try to switch to a new gang.
+ * Returns: NULL if we switched to the new gang or a reference to the current
+ * gang leader.
+ */
+struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
+ struct dma_fence *gang)
+{
+ struct dma_fence *old = NULL;
+
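+	/* Lockless update: re-read the current gang leader until the cmpxchg
+	 * succeeds; if the previous leader hasn't signaled yet, return it to
+	 * the caller instead of switching.
+	 */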
+ do {
+ dma_fence_put(old);
+ rcu_read_lock();
+ old = dma_fence_get_rcu_safe(&adev->gang_submit);
+ rcu_read_unlock();
+
+ if (old == gang)
+ break;
+
+ if (!dma_fence_is_signaled(old))
+ return old;
+
+ } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
+ old, gang) != old);
+
+ dma_fence_put(old);
+ return NULL;
+}