diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 357 |
1 files changed, 229 insertions, 128 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index af9bdf16eefd..a8b08a72b71b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/console.h> #include <linux/slab.h> +#include <linux/iommu.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_probe_helper.h> @@ -125,6 +126,7 @@ const char *amdgpu_asic_name[] = { "DIMGREY_CAVEFISH", "BEIGE_GOBY", "YELLOW_CARP", + "IP DISCOVERY", "LAST", }; @@ -305,7 +307,7 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, uint64_t last; int idx; - if (!drm_dev_enter(&adev->ddev, &idx)) + if (!drm_dev_enter(adev_to_drm(adev), &idx)) return; BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4)); @@ -330,7 +332,7 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, } /** - * amdgpu_device_vram_access - access vram by vram aperature + * amdgpu_device_aper_access - access vram by vram aperature * * @adev: amdgpu_device pointer * @pos: offset of the buffer in vram @@ -549,11 +551,11 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, trace_amdgpu_device_wreg(adev->pdev->device, reg, v); } -/* +/** * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range * * this function is invoked only the debugfs register access - * */ + */ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { @@ -1099,7 +1101,7 @@ static void amdgpu_device_wb_fini(struct amdgpu_device *adev) } /** - * amdgpu_device_wb_init- Init Writeback driver info and allocate memory + * amdgpu_device_wb_init - Init Writeback driver info and allocate memory * * @adev: amdgpu_device pointer * @@ -2126,46 +2128,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (r) return r; break; - case CHIP_VEGA10: - case CHIP_VEGA12: - case CHIP_VEGA20: - case CHIP_RAVEN: - case CHIP_ARCTURUS: - case CHIP_RENOIR: - case CHIP_ALDEBARAN: - if (adev->flags & AMD_IS_APU) - adev->family = AMDGPU_FAMILY_RV; - else - adev->family = AMDGPU_FAMILY_AI; - - r = soc15_set_ip_blocks(adev); - if (r) - return r; - break; - case CHIP_NAVI10: - case CHIP_NAVI14: - case CHIP_NAVI12: - case CHIP_SIENNA_CICHLID: - case CHIP_NAVY_FLOUNDER: - case CHIP_DIMGREY_CAVEFISH: - case CHIP_BEIGE_GOBY: - case CHIP_VANGOGH: - case CHIP_YELLOW_CARP: - case CHIP_CYAN_SKILLFISH: - if (adev->asic_type == CHIP_VANGOGH) - adev->family = AMDGPU_FAMILY_VGH; - else if (adev->asic_type == CHIP_YELLOW_CARP) - adev->family = AMDGPU_FAMILY_YC; - else - adev->family = AMDGPU_FAMILY_NV; - - r = nv_set_ip_blocks(adev); + default: + r = amdgpu_discovery_set_ip_blocks(adev); if (r) return r; break; - default: - /* FIXME: not supported yet */ - return -EINVAL; } amdgpu_amdkfd_device_probe(adev); @@ -2350,6 +2317,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) /* need to do gmc hw init early so we can allocate gpu mem */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { + /* Try to reserve bad pages early */ + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_exchange_data(adev); + r = amdgpu_device_vram_scratch_init(adev); if (r) { DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r); @@ -2381,7 +2352,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } if (amdgpu_sriov_vf(adev)) - amdgpu_virt_init_data_exchange(adev); + amdgpu_virt_exchange_data(adev); r = amdgpu_ib_pool_init(adev); if (r) { @@ -2432,10 +2403,6 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (!adev->gmc.xgmi.pending_reset) amdgpu_amdkfd_device_init(adev); - r = amdgpu_amdkfd_resume_iommu(adev); - if (r) - goto init_failed; - amdgpu_fru_get_product_info(adev); init_failed: @@ -2652,11 +2619,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); - /* For XGMI + passthrough configuration on arcturus, enable light SBR */ - if (adev->asic_type == CHIP_ARCTURUS && - amdgpu_passthrough(adev) && - adev->gmc.xgmi.num_physical_nodes > 1) - smu_set_light_sbr(&adev->smu, true); + /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */ + if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)|| + adev->asic_type == CHIP_ALDEBARAN )) + smu_handle_passthrough_sbr(&adev->smu, true); if (adev->gmc.xgmi.num_physical_nodes > 1) { mutex_lock(&mgpu_info.mutex); @@ -2695,6 +2661,36 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) return 0; } +/** + * amdgpu_device_smu_fini_early - smu hw_fini wrapper + * + * @adev: amdgpu_device pointer + * + * For ASICs need to disable SMC first + */ +static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) +{ + int i, r; + + if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) + return; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.hw) + continue; + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { + r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); + /* XXX handle errors */ + if (r) { + DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + } + adev->ip_blocks[i].status.hw = false; + break; + } + } +} + static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) { int i, r; @@ -2715,21 +2711,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - /* need to disable SMC first */ - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.hw) - continue; - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { - r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); - /* XXX handle errors */ - if (r) { - DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - } - adev->ip_blocks[i].status.hw = false; - break; - } - } + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(adev); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.hw) @@ -2745,6 +2728,11 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) adev->ip_blocks[i].status.hw = false; } + if (amdgpu_sriov_vf(adev)) { + if (amdgpu_virt_release_full_gpu(adev, false)) + DRM_ERROR("failed to release exclusive mode on fini\n"); + } + return 0; } @@ -2766,8 +2754,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) amdgpu_virt_release_ras_err_handler_data(adev); - amdgpu_ras_pre_fini(adev); - if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); @@ -2805,10 +2791,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_ras_fini(adev); - if (amdgpu_sriov_vf(adev)) - if (amdgpu_virt_release_full_gpu(adev, false)) - DRM_ERROR("failed to release exclusive mode on fini\n"); - return 0; } @@ -3203,12 +3185,28 @@ static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) { switch (asic_type) { +#ifdef CONFIG_DRM_AMDGPU_SI + case CHIP_HAINAN: +#endif + case CHIP_TOPAZ: + /* chips with no display hardware */ + return false; #if defined(CONFIG_DRM_AMD_DC) -#if defined(CONFIG_DRM_AMD_DC_SI) case CHIP_TAHITI: case CHIP_PITCAIRN: case CHIP_VERDE: case CHIP_OLAND: + /* + * We have systems in the wild with these ASICs that require + * LVDS and VGA support which is not supported with DC. + * + * Fallback to the non-DC driver here by default so as not to + * cause regressions. + */ +#if defined(CONFIG_DRM_AMD_DC_SI) + return amdgpu_dc > 0; +#else + return false; #endif case CHIP_BONAIRE: case CHIP_KAVERI: @@ -3240,6 +3238,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_NAVI14: case CHIP_NAVI12: case CHIP_RENOIR: + case CHIP_CYAN_SKILLFISH: case CHIP_SIENNA_CICHLID: case CHIP_NAVY_FLOUNDER: case CHIP_DIMGREY_CAVEFISH: @@ -3247,13 +3246,15 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_VANGOGH: case CHIP_YELLOW_CARP: #endif + default: return amdgpu_dc != 0; -#endif +#else default: if (amdgpu_dc > 0) DRM_INFO_ONCE("Display Core has been requested via kernel parameter " "but isn't supported by ASIC, ignoring\n"); return false; +#endif } } @@ -3354,6 +3355,8 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) continue; } else if (timeout < 0) { timeout = MAX_SCHEDULE_TIMEOUT; + dev_warn(adev->dev, "lockup timeout disabled"); + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); } else { timeout = msecs_to_jiffies(timeout); } @@ -3389,6 +3392,22 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) return ret; } +/** + * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU + * + * @adev: amdgpu_device pointer + * + * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode + */ +static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(adev->dev); + if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) + adev->ram_is_direct_mapped = true; +} + static const struct attribute *amdgpu_dev_attributes[] = { &dev_attr_product_name.attr, &dev_attr_product_number.attr, @@ -3531,6 +3550,9 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->rmmio_size = pci_resource_len(adev->pdev, 2); } + for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++) + atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN); + adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size); if (adev->rmmio == NULL) { return -ENOMEM; @@ -3538,17 +3560,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base); DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size); - /* enable PCIE atomic ops */ - r = pci_enable_atomic_ops_to_root(adev->pdev, - PCI_EXP_DEVCAP2_ATOMIC_COMP32 | - PCI_EXP_DEVCAP2_ATOMIC_COMP64); - if (r) { - adev->have_atomics_support = false; - DRM_INFO("PCIE atomic ops is not supported\n"); - } else { - adev->have_atomics_support = true; - } - amdgpu_device_get_pcie_info(adev); if (amdgpu_mcbp) @@ -3571,6 +3582,26 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + /* Need to get xgmi info early to decide the reset behavior*/ + if (adev->gmc.xgmi.supported) { + r = adev->gfxhub.funcs->get_xgmi_info(adev); + if (r) + return r; + } + + /* enable PCIE atomic ops */ + if (amdgpu_sriov_vf(adev)) + adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *) + adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_enabled_flags == + (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64); + else + adev->have_atomics_support = + !pci_enable_atomic_ops_to_root(adev->pdev, + PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64); + if (!adev->have_atomics_support) + dev_info(adev->dev, "PCIE atomic ops is not supported\n"); + /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3704,8 +3735,6 @@ fence_driver_init: /* Get a log2 for easy divisions. */ adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps)); - amdgpu_fbdev_init(adev); - r = amdgpu_pm_sysfs_init(adev); if (r) { adev->pm_sysfs_en = false; @@ -3789,6 +3818,8 @@ fence_driver_init: queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); + amdgpu_device_check_iommu_direct_map(adev); + return 0; release_ras_con: @@ -3822,7 +3853,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) } /** - * amdgpu_device_fini - tear down the driver + * amdgpu_device_fini_hw - tear down the driver * * @adev: amdgpu_device pointer * @@ -3850,7 +3881,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) /* disable all interrupts */ amdgpu_irq_disable_all(adev); if (adev->mode_info.mode_config_initialized){ - if (!amdgpu_device_has_dc_support(adev)) + if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev))) drm_helper_force_disable_all(adev_to_drm(adev)); else drm_atomic_helper_shutdown(adev_to_drm(adev)); @@ -3863,21 +3894,27 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_ucode_sysfs_fini(adev); sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); - amdgpu_fbdev_fini(adev); + /* disable ras feature must before hw fini */ + amdgpu_ras_pre_fini(adev); + + amdgpu_device_ip_fini_early(adev); amdgpu_irq_fini_hw(adev); - amdgpu_device_ip_fini_early(adev); + if (adev->mman.initialized) + ttm_device_clear_dma_mappings(&adev->mman.bdev); amdgpu_gart_dummy_page_fini(adev); - amdgpu_device_unmap_mmio(adev); + if (drm_dev_is_unplugged(adev_to_drm(adev))) + amdgpu_device_unmap_mmio(adev); + } void amdgpu_device_fini_sw(struct amdgpu_device *adev) { - amdgpu_device_ip_fini(adev); amdgpu_fence_driver_sw_fini(adev); + amdgpu_device_ip_fini(adev); release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; @@ -3909,6 +3946,25 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) } +/** + * amdgpu_device_evict_resources - evict device resources + * @adev: amdgpu device object + * + * Evicts all ttm device resources(vram BOs, gart table) from the lru list + * of the vram memory type. Mainly used for evicting device resources + * at suspend time. + * + */ +static void amdgpu_device_evict_resources(struct amdgpu_device *adev) +{ + /* No need to evict vram on APUs for suspend to ram */ + if (adev->in_s3 && (adev->flags & AMD_IS_APU)) + return; + + if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM)) + DRM_WARN("evicting device resources failed\n"); + +} /* * Suspend & resume. @@ -3938,7 +3994,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) drm_kms_helper_poll_disable(dev); if (fbcon) - amdgpu_fbdev_set_suspend(adev, 1); + drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -3949,17 +4005,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) if (!adev->in_s0ix) amdgpu_amdkfd_suspend(adev, adev->in_runpm); - /* evict vram memory */ - amdgpu_bo_evict_vram(adev); + /* First evict vram memory */ + amdgpu_device_evict_resources(adev); amdgpu_fence_driver_hw_fini(adev); amdgpu_device_ip_suspend_phase2(adev); - /* evict remaining vram memory - * This second call to evict vram is to evict the gart page table - * using the CPU. + /* This second call to evict device resources is to evict + * the gart page table using the CPU. */ - amdgpu_bo_evict_vram(adev); + amdgpu_device_evict_resources(adev); return 0; } @@ -4016,7 +4071,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) flush_delayed_work(&adev->delayed_init_work); if (fbcon) - amdgpu_fbdev_set_suspend(adev, 0); + drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false); drm_kms_helper_poll_enable(dev); @@ -4285,6 +4340,11 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor) { int r; + struct amdgpu_hive_info *hive = NULL; + + amdgpu_amdkfd_pre_reset(adev); + + amdgpu_amdkfd_pre_reset(adev); if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); @@ -4293,8 +4353,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; - amdgpu_amdkfd_pre_reset(adev); - /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) @@ -4313,9 +4371,19 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) goto error; - amdgpu_irq_gpu_reset_resume_helper(adev); - r = amdgpu_ib_ring_tests(adev); - amdgpu_amdkfd_post_reset(adev); + hive = amdgpu_get_xgmi_hive(adev); + /* Update PSP FW topology after reset */ + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) + r = amdgpu_xgmi_update_topology(hive, adev); + + if (hive) + amdgpu_put_xgmi_hive(hive); + + if (!r) { + amdgpu_irq_gpu_reset_resume_helper(adev); + r = amdgpu_ib_ring_tests(adev); + amdgpu_amdkfd_post_reset(adev); + } error: if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { @@ -4458,7 +4526,7 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, struct amdgpu_reset_context *reset_context) { - int i, j, r = 0; + int i, r = 0; struct amdgpu_job *job = NULL; bool need_full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); @@ -4466,10 +4534,6 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, if (reset_context->reset_req_dev == adev) job = reset_context->job; - /* no need to dump if device is not in good state during probe period */ - if (!adev->gmc.xgmi.pending_reset) - amdgpu_debugfs_wait_dump(adev); - if (amdgpu_sriov_vf(adev)) { /* stop the data exchange thread */ amdgpu_virt_fini_data_exchange(adev); @@ -4484,15 +4548,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, /*clear job fence from fence drv to avoid force_completion *leave NULL and vm flush fence in fence drv */ - for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) { - struct dma_fence *old, **ptr; + amdgpu_fence_driver_clear_job_fences(ring); - ptr = &ring->fence_drv.fences[j]; - old = rcu_dereference_protected(*ptr, 1); - if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) { - RCU_INIT_POINTER(*ptr, NULL); - } - } /* after all hw jobs are reset, hw fence is meaningless, so force_completion */ amdgpu_fence_driver_force_completion(ring); } @@ -4652,7 +4709,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, if (r) goto out; - amdgpu_fbdev_set_suspend(tmp_adev, 0); + drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false); /* * The GPU enters bad state once faulty pages @@ -4751,7 +4808,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp { struct amdgpu_device *tmp_adev = NULL; - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { if (!hive) { dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); return -ENODEV; @@ -4860,6 +4917,9 @@ static void amdgpu_device_recheck_guilty_jobs( /* clear job's guilty and depend the folowing step to decide the real one */ drm_sched_reset_karma(s_job); + /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get + * to make sure fence is balanced */ + dma_fence_get(s_job->s_fence->parent); drm_sched_resubmit_jobs_ext(&ring->sched, 1); ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout); @@ -4895,6 +4955,7 @@ retry: /* got the hw fence, signal finished fence */ atomic_dec(ring->sched.score); + dma_fence_put(s_job->s_fence->parent); dma_fence_get(&s_job->s_fence->finished); dma_fence_signal(&s_job->s_fence->finished); dma_fence_put(&s_job->s_fence->finished); @@ -4959,7 +5020,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev); + if (!amdgpu_sriov_vf(adev)) + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", @@ -5000,7 +5062,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * to put adev in the 1st position. */ INIT_LIST_HEAD(&device_list); - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) list_add_tail(&tmp_adev->reset_list, &device_list); if (!list_is_first(&adev->reset_list, &device_list)) @@ -5039,7 +5101,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ amdgpu_unregister_gpu_instance(tmp_adev); - amdgpu_fbdev_set_suspend(tmp_adev, 1); + drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true); /* disable ras on ALL IPs */ if (!need_emergency_restart && @@ -5089,7 +5151,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); /* Actual ASIC resets if needed.*/ - /* TODO Implement XGMI hive reset logic for SRIOV */ + /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { r = amdgpu_device_reset_sriov(adev, job ? false : true); if (r) @@ -5130,7 +5192,7 @@ skip_hw_reset: drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res); } - if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) { + if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) { drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); } @@ -5151,7 +5213,7 @@ skip_sched_resume: list_for_each_entry(tmp_adev, device_list_handle, reset_list) { /* unlock kfd: SRIOV would do it separately */ if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) - amdgpu_amdkfd_post_reset(tmp_adev); + amdgpu_amdkfd_post_reset(tmp_adev); /* kfd_post_reset will do nothing if kfd device is not initialized, * need to bring up kfd here if it's not be initialized before @@ -5634,3 +5696,42 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } + +/** + * amdgpu_device_halt() - bring hardware to some kind of halt state + * + * @adev: amdgpu_device pointer + * + * Bring hardware to some kind of halt state so that no one can touch it + * any more. It will help to maintain error context when error occurred. + * Compare to a simple hang, the system will keep stable at least for SSH + * access. Then it should be trivial to inspect the hardware state and + * see what's going on. Implemented as following: + * + * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), + * clears all CPU mappings to device, disallows remappings through page faults + * 2. amdgpu_irq_disable_all() disables all interrupts + * 3. amdgpu_fence_driver_hw_fini() signals all HW fences + * 4. set adev->no_hw_access to avoid potential crashes after setp 5 + * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings + * 6. pci_disable_device() and pci_wait_for_pending_transaction() + * flush any in flight DMA operations + */ +void amdgpu_device_halt(struct amdgpu_device *adev) +{ + struct pci_dev *pdev = adev->pdev; + struct drm_device *ddev = adev_to_drm(adev); + + drm_dev_unplug(ddev); + + amdgpu_irq_disable_all(adev); + + amdgpu_fence_driver_hw_fini(adev); + + adev->no_hw_access = true; + + amdgpu_device_unmap_mmio(adev); + + pci_disable_device(pdev); + pci_wait_for_pending_transaction(pdev); +} |