Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 613
1 file changed, 436 insertions(+), 177 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e4dd09a5072..4f8fb4ecde34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -30,8 +30,8 @@
 #include <linux/console.h>
 #include <linux/slab.h>
 #include <drm/drmP.h>
-#include <drm/drm_crtc_helper.h>
 #include <drm/drm_atomic_helper.h>
+#include <drm/drm_probe_helper.h>
 #include <drm/amdgpu_drm.h>
 #include <linux/vgaarb.h>
 #include <linux/vga_switcheroo.h>
@@ -59,6 +59,8 @@
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_pm.h"

+#include "amdgpu_xgmi.h"
+
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -513,6 +515,7 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
  */
 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 {
+        /* No doorbell on SI hardware generation */
         if (adev->asic_type < CHIP_BONAIRE) {
                 adev->doorbell.base = 0;
@@ -525,15 +528,26 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
                 return -EINVAL;

+        amdgpu_asic_init_doorbell_index(adev);
+
         /* doorbell bar mapping */
         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
         adev->doorbell.size = pci_resource_len(adev->pdev, 2);

         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
-                                             AMDGPU_DOORBELL_MAX_ASSIGNMENT+1);
+                                             adev->doorbell_index.max_assignment+1);
         if (adev->doorbell.num_doorbells == 0)
                 return -EINVAL;

+        /* For Vega, reserve and map two pages on doorbell BAR since SDMA
+         * paging queue doorbell use the second page. The
+         * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
+         * doorbells are in the first page. So with paging queue enabled,
+         * the max num_doorbells should + 1 page (0x400 in dword)
+         */
+        if (adev->asic_type >= CHIP_VEGA10)
+                adev->doorbell.num_doorbells += 0x400;
+
         adev->doorbell.ptr = ioremap(adev->doorbell.base,
                                      adev->doorbell.num_doorbells * sizeof(u32));
@@ -1493,8 +1507,6 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
         }

         adev->powerplay.pp_feature = amdgpu_pp_feature_mask;
-        if (amdgpu_sriov_vf(adev))
-                adev->powerplay.pp_feature &= ~PP_GFXOFF_MASK;

         for (i = 0; i < adev->num_ip_blocks; i++) {
                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
@@ -1600,7 +1612,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
                 }
         }

-        if (adev->powerplay.pp_funcs->load_firmware) {
+        if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->load_firmware) {
                 r = adev->powerplay.pp_funcs->load_firmware(adev->powerplay.pp_handle);
                 if (r) {
                         pr_err("firmware loading failed\n");
@@ -1633,7 +1645,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
                 if (r) {
                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
                                   adev->ip_blocks[i].version->funcs->name, r);
-                        return r;
+                        goto init_failed;
                 }
                 adev->ip_blocks[i].status.sw = true;
@@ -1642,26 +1654,28 @@
                         r = amdgpu_device_vram_scratch_init(adev);
                         if (r) {
                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
-                                return r;
+                                goto init_failed;
                         }
                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
                         if (r) {
                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
-                                return r;
+                                goto init_failed;
                         }
                         r = amdgpu_device_wb_init(adev);
                         if (r) {
                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
-                                return r;
+                                goto init_failed;
                         }
                         adev->ip_blocks[i].status.hw = true;

                         /* right after GMC hw init, we create CSA */
                         if (amdgpu_sriov_vf(adev)) {
-                                r = amdgpu_allocate_static_csa(adev);
+                                r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
+                                                               AMDGPU_GEM_DOMAIN_VRAM,
+                                                               AMDGPU_CSA_SIZE);
                                 if (r) {
                                         DRM_ERROR("allocate CSA failed %d\n", r);
-                                        return r;
+                                        goto init_failed;
                                 }
                         }
                 }
@@ -1669,27 +1683,32 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)

         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
         if (r)
-                return r;
+                goto init_failed;

         r = amdgpu_device_ip_hw_init_phase1(adev);
         if (r)
-                return r;
+                goto init_failed;

         r = amdgpu_device_fw_loading(adev);
         if (r)
-                return r;
+                goto init_failed;

         r = amdgpu_device_ip_hw_init_phase2(adev);
         if (r)
-                return r;
+                goto init_failed;

-        amdgpu_xgmi_add_device(adev);
+        if (adev->gmc.xgmi.num_physical_nodes > 1)
+                amdgpu_xgmi_add_device(adev);
         amdgpu_amdkfd_device_init(adev);

-        if (amdgpu_sriov_vf(adev))
+init_failed:
+        if (amdgpu_sriov_vf(adev)) {
+                if (!r)
+                        amdgpu_virt_init_data_exchange(adev);
                 amdgpu_virt_release_full_gpu(adev, true);
+        }

-        return 0;
+        return r;
 }
@@ -1850,6 +1869,9 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 {
         int i, r;

+        if (adev->gmc.xgmi.num_physical_nodes > 1)
+                amdgpu_xgmi_remove_device(adev);
+
         amdgpu_amdkfd_device_fini(adev);

         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
@@ -1892,7 +1914,7 @@
                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
                         amdgpu_ucode_free_bo(adev);
-                        amdgpu_free_static_csa(adev);
+                        amdgpu_free_static_csa(&adev->virt.csa_obj);
                         amdgpu_device_wb_fini(adev);
                         amdgpu_device_vram_scratch_fini(adev);
                 }
@@ -2113,7 +2135,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
                         continue;

                 r = block->version->funcs->hw_init(adev);
-                DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+                DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
                 if (r)
                         return r;
         }
@@ -2147,7 +2169,7 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
                         continue;

                 r = block->version->funcs->hw_init(adev);
-                DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
+                DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
                 if (r)
                         return r;
         }
@@ -2339,6 +2361,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
         return amdgpu_device_asic_has_dc_support(adev->asic_type);
 }

+
+static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
+{
+        struct amdgpu_device *adev =
+                container_of(__work, struct amdgpu_device, xgmi_reset_work);
+
+        adev->asic_reset_res = amdgpu_asic_reset(adev);
+        if (adev->asic_reset_res)
+                DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s",
+                         adev->asic_reset_res, adev->ddev->unique);
+}
+
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -2437,6 +2472,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
                           amdgpu_device_delay_enable_gfx_off);

+        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+
         adev->gfx.gfx_off_req_count = 1;
         adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
@@ -2457,9 +2494,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);

-        /* doorbell bar mapping */
-        amdgpu_device_doorbell_init(adev);
-
         /* io port mapping */
         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
@@ -2478,6 +2512,9 @@
         if (r)
                 return r;

+        /* doorbell bar mapping and doorbell index init*/
+        amdgpu_device_doorbell_init(adev);
+
         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
         /* this will fail for cards that aren't VGA class devices, just
          * ignore it */
@@ -2513,6 +2550,17 @@
         /* detect if we are with an SRIOV vbios */
         amdgpu_device_detect_sriov_bios(adev);

+        /* check if we need to reset the asic
+         *  E.g., driver was not cleanly unloaded previously, etc.
+         */
+        if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
+                r = amdgpu_asic_reset(adev);
+                if (r) {
+                        dev_err(adev->dev, "asic reset on init failed\n");
+                        goto failed;
+                }
+        }
+
         /* Post card if necessary */
         if (amdgpu_device_need_post(adev)) {
                 if (!adev->bios) {
@@ -2577,6 +2625,8 @@ fence_driver_init:
                 }
                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
+                if (amdgpu_virt_request_full_gpu(adev, false))
+                        amdgpu_virt_release_full_gpu(adev, false);
                 goto failed;
         }
@@ -2599,9 +2649,6 @@ fence_driver_init:
                 goto failed;
         }

-        if (amdgpu_sriov_vf(adev))
-                amdgpu_virt_init_data_exchange(adev);
-
         amdgpu_fbdev_init(adev);

         r = amdgpu_pm_sysfs_init(adev);
@@ -2675,7 +2722,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
         amdgpu_irq_disable_all(adev);
         if (adev->mode_info.mode_config_initialized){
                 if (!amdgpu_device_has_dc_support(adev))
-                        drm_crtc_force_disable_all(adev->ddev);
+                        drm_helper_force_disable_all(adev->ddev);
                 else
                         drm_atomic_helper_shutdown(adev->ddev);
         }
@@ -2765,7 +2812,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
                 struct drm_framebuffer *fb = crtc->primary->fb;
                 struct amdgpu_bo *robj;

-                if (amdgpu_crtc->cursor_bo) {
+                if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
                         struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
                         r = amdgpu_bo_reserve(aobj, true);
                         if (r == 0) {
@@ -2873,7 +2920,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
         list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
                 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);

-                if (amdgpu_crtc->cursor_bo) {
+                if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
                         struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
                         r = amdgpu_bo_reserve(aobj, true);
                         if (r == 0) {
@@ -3150,86 +3197,6 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
         return 0;
 }

-/**
- * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
- *
- * @adev: amdgpu device pointer
- *
- * attempt to do soft-reset or full-reset and reinitialize Asic
- * return 0 means succeeded otherwise failed
- */
-static int amdgpu_device_reset(struct amdgpu_device *adev)
-{
-        bool need_full_reset, vram_lost = 0;
-        int r;
-
-        need_full_reset = amdgpu_device_ip_need_full_reset(adev);
-
-        if (!need_full_reset) {
-                amdgpu_device_ip_pre_soft_reset(adev);
-                r = amdgpu_device_ip_soft_reset(adev);
-                amdgpu_device_ip_post_soft_reset(adev);
-                if (r || amdgpu_device_ip_check_soft_reset(adev)) {
-                        DRM_INFO("soft reset failed, will fallback to full reset!\n");
-                        need_full_reset = true;
-                }
-        }
-
-        if (need_full_reset) {
-                r = amdgpu_device_ip_suspend(adev);
-
-retry:
-                r = amdgpu_asic_reset(adev);
-                /* post card */
-                amdgpu_atom_asic_init(adev->mode_info.atom_context);
-
-                if (!r) {
-                        dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-                        r = amdgpu_device_ip_resume_phase1(adev);
-                        if (r)
-                                goto out;
-
-                        vram_lost = amdgpu_device_check_vram_lost(adev);
-                        if (vram_lost) {
-                                DRM_ERROR("VRAM is lost!\n");
-                                atomic_inc(&adev->vram_lost_counter);
-                        }
-
-                        r = amdgpu_gtt_mgr_recover(
-                                &adev->mman.bdev.man[TTM_PL_TT]);
-                        if (r)
-                                goto out;
-
-                        r = amdgpu_device_fw_loading(adev);
-                        if (r)
-                                return r;
-
-                        r = amdgpu_device_ip_resume_phase2(adev);
-                        if (r)
-                                goto out;
-
-                        if (vram_lost)
-                                amdgpu_device_fill_reset_magic(adev);
-                }
-        }
-
-out:
-        if (!r) {
-                amdgpu_irq_gpu_reset_resume_helper(adev);
-                r = amdgpu_ib_ring_tests(adev);
-                if (r) {
-                        dev_err(adev->dev, "ib ring test failed (%d).\n", r);
-                        r = amdgpu_device_ip_suspend(adev);
-                        need_full_reset = true;
-                        goto retry;
-                }
-        }
-
-        if (!r)
-                r = amdgpu_device_recover_vram(adev);
-
-        return r;
-}

 /**
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
@@ -3273,6 +3240,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
         r = amdgpu_ib_ring_tests(adev);

 error:
+        amdgpu_virt_init_data_exchange(adev);
         amdgpu_virt_release_full_gpu(adev, true);
         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
                 atomic_inc(&adev->vram_lost_counter);
@@ -3297,40 +3265,46 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
                 return false;
         }

-        if (amdgpu_gpu_recovery == 0 || (amdgpu_gpu_recovery == -1 &&
-                                         !amdgpu_sriov_vf(adev))) {
-                DRM_INFO("GPU recovery disabled.\n");
-                return false;
-        }
+        if (amdgpu_gpu_recovery == 0)
+                goto disabled;

-        return true;
-}
+        if (amdgpu_sriov_vf(adev))
+                return true;

-/**
- * amdgpu_device_gpu_recover - reset the asic and recover scheduler
- *
- * @adev: amdgpu device pointer
- * @job: which job trigger hang
- *
- * Attempt to reset the GPU if it has hung (all asics).
- * Returns 0 for success or an error on failure.
- */
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-                              struct amdgpu_job *job)
-{
-        int i, r, resched;
+        if (amdgpu_gpu_recovery == -1) {
+                switch (adev->asic_type) {
+                case CHIP_BONAIRE:
+                case CHIP_HAWAII:
+                case CHIP_TOPAZ:
+                case CHIP_TONGA:
+                case CHIP_FIJI:
+                case CHIP_POLARIS10:
+                case CHIP_POLARIS11:
+                case CHIP_POLARIS12:
+                case CHIP_VEGAM:
+                case CHIP_VEGA20:
+                case CHIP_VEGA10:
+                case CHIP_VEGA12:
+                        break;
+                default:
+                        goto disabled;
+                }
+        }

-        dev_info(adev->dev, "GPU reset begin!\n");
+        return true;

-        mutex_lock(&adev->lock_reset);
-        atomic_inc(&adev->gpu_reset_counter);
-        adev->in_gpu_reset = 1;
+disabled:
+        DRM_INFO("GPU recovery disabled.\n");
+        return false;
+}

-        /* Block kfd */
-        amdgpu_amdkfd_pre_reset(adev);

-        /* block TTM */
-        resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+                                        struct amdgpu_job *job,
+                                        bool *need_full_reset_arg)
+{
+        int i, r = 0;
+        bool need_full_reset = *need_full_reset_arg;

         /* block all schedulers and reset given job's ring */
         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3339,21 +3313,153 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (!ring || !ring->sched.thread)
                         continue;

-                kthread_park(ring->sched.thread);
-
-                if (job && job->base.sched == &ring->sched)
-                        continue;
-
-                drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
+                drm_sched_stop(&ring->sched);

                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
                 amdgpu_fence_driver_force_completion(ring);
         }

-        if (amdgpu_sriov_vf(adev))
-                r = amdgpu_device_reset_sriov(adev, job ? false : true);
-        else
-                r = amdgpu_device_reset(adev);
+        if(job)
+                drm_sched_increase_karma(&job->base);
+
+
+
+        if (!amdgpu_sriov_vf(adev)) {
+
+                if (!need_full_reset)
+                        need_full_reset = amdgpu_device_ip_need_full_reset(adev);
+
+                if (!need_full_reset) {
+                        amdgpu_device_ip_pre_soft_reset(adev);
+                        r = amdgpu_device_ip_soft_reset(adev);
+                        amdgpu_device_ip_post_soft_reset(adev);
+                        if (r || amdgpu_device_ip_check_soft_reset(adev)) {
+                                DRM_INFO("soft reset failed, will fallback to full reset!\n");
+                                need_full_reset = true;
+                        }
+                }
+
+                if (need_full_reset)
+                        r = amdgpu_device_ip_suspend(adev);
+
+                *need_full_reset_arg = need_full_reset;
+        }
+
+        return r;
+}
+
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+                                struct list_head *device_list_handle,
+                                bool *need_full_reset_arg)
+{
+        struct amdgpu_device *tmp_adev = NULL;
+        bool need_full_reset = *need_full_reset_arg, vram_lost = false;
+        int r = 0;
+
+        /*
+         * ASIC reset has to be done on all HGMI hive nodes ASAP
+         * to allow proper links negotiation in FW (within 1 sec)
+         */
+        if (need_full_reset) {
+                list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+                        /* For XGMI run all resets in parallel to speed up the process */
+                        if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
+                                        r = -EALREADY;
+                        } else
+                                r = amdgpu_asic_reset(tmp_adev);
+
+                        if (r) {
+                                DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s",
+                                          r, tmp_adev->ddev->unique);
+                                break;
+                        }
+                }
+
+                /* For XGMI wait for all PSP resets to complete before proceed */
+                if (!r) {
+                        list_for_each_entry(tmp_adev, device_list_handle,
+                                            gmc.xgmi.head) {
+                                if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+                                        flush_work(&tmp_adev->xgmi_reset_work);
+                                        r = tmp_adev->asic_reset_res;
+                                        if (r)
+                                                break;
+                                }
+                        }
+                }
+        }
+
+
+        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+                if (need_full_reset) {
+                        /* post card */
+                        if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
+                                DRM_WARN("asic atom init failed!");
+
+                        if (!r) {
+                                dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
+                                r = amdgpu_device_ip_resume_phase1(tmp_adev);
+                                if (r)
+                                        goto out;
+
+                                vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+                                if (vram_lost) {
+                                        DRM_ERROR("VRAM is lost!\n");
+                                        atomic_inc(&tmp_adev->vram_lost_counter);
+                                }
+
+                                r = amdgpu_gtt_mgr_recover(
+                                        &tmp_adev->mman.bdev.man[TTM_PL_TT]);
+                                if (r)
+                                        goto out;
+
+                                r = amdgpu_device_fw_loading(tmp_adev);
+                                if (r)
+                                        return r;
+
+                                r = amdgpu_device_ip_resume_phase2(tmp_adev);
+                                if (r)
+                                        goto out;
+
+                                if (vram_lost)
+                                        amdgpu_device_fill_reset_magic(tmp_adev);
+
+                                /* Update PSP FW topology after reset */
+                                if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+                                        r = amdgpu_xgmi_update_topology(hive, tmp_adev);
+                        }
+                }
+
+
+out:
+                if (!r) {
+                        amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
+                        r = amdgpu_ib_ring_tests(tmp_adev);
+                        if (r) {
+                                dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
+                                r = amdgpu_device_ip_suspend(tmp_adev);
+                                need_full_reset = true;
+                                r = -EAGAIN;
+                                goto end;
+                        }
+                }
+
+                if (!r)
+                        r = amdgpu_device_recover_vram(tmp_adev);
+                else
+                        tmp_adev->asic_reset_res = r;
+        }
+
+end:
+        *need_full_reset_arg = need_full_reset;
+        return r;
+}
+
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
+                                          struct amdgpu_job *job)
+{
+        int i;

         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                 struct amdgpu_ring *ring = adev->rings[i];
@@ -3361,38 +3467,189 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                 if (!ring || !ring->sched.thread)
                         continue;

-                /* only need recovery sched of the given job's ring
-                 * or all rings (in the case @job is NULL)
-                 * after above amdgpu_reset accomplished
-                 */
-                if ((!job || job->base.sched == &ring->sched) && !r)
-                        drm_sched_job_recovery(&ring->sched);
+                if (!adev->asic_reset_res)
+                        drm_sched_resubmit_jobs(&ring->sched);

-                kthread_unpark(ring->sched.thread);
+                drm_sched_start(&ring->sched, !adev->asic_reset_res);
         }

         if (!amdgpu_device_has_dc_support(adev)) {
                 drm_helper_resume_force_mode(adev->ddev);
         }

-        ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
+        adev->asic_reset_res = 0;
+}

-        if (r) {
-                /* bad news, how to tell it to userspace ? */
-                dev_info(adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
-                amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
-        } else {
-                dev_info(adev->dev, "GPU reset(%d) succeeded!\n",atomic_read(&adev->gpu_reset_counter));
-        }
+static void amdgpu_device_lock_adev(struct amdgpu_device *adev)
+{
+        mutex_lock(&adev->lock_reset);
+        atomic_inc(&adev->gpu_reset_counter);
+        adev->in_gpu_reset = 1;
+        /* Block kfd: SRIOV would do it separately */
+        if (!amdgpu_sriov_vf(adev))
+                amdgpu_amdkfd_pre_reset(adev);
+}

-        /*unlock kfd */
-        amdgpu_amdkfd_post_reset(adev);
+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
+{
+        /*unlock kfd: SRIOV would do it separately */
+        if (!amdgpu_sriov_vf(adev))
+                amdgpu_amdkfd_post_reset(adev);
         amdgpu_vf_error_trans_all(adev);
         adev->in_gpu_reset = 0;
         mutex_unlock(&adev->lock_reset);
+}
+
+
+/**
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
+ *
+ * @adev: amdgpu device pointer
+ * @job: which job trigger hang
+ *
+ * Attempt to reset the GPU if it has hung (all asics).
+ * Attempt to do soft-reset or full-reset and reinitialize Asic
+ * Returns 0 for success or an error on failure.
+ */
+
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
+                              struct amdgpu_job *job)
+{
+        int r;
+        struct amdgpu_hive_info *hive = NULL;
+        bool need_full_reset = false;
+        struct amdgpu_device *tmp_adev = NULL;
+        struct list_head device_list, *device_list_handle = NULL;
+
+        INIT_LIST_HEAD(&device_list);
+
+        dev_info(adev->dev, "GPU reset begin!\n");
+
+        /*
+         * In case of XGMI hive disallow concurrent resets to be triggered
+         * by different nodes. No point also since the one node already executing
+         * reset will also reset all the other nodes in the hive.
+         */
+        hive = amdgpu_get_xgmi_hive(adev, 0);
+        if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
+            !mutex_trylock(&hive->reset_lock))
+                return 0;
+
+        /* Start with adev pre asic reset first for soft reset check.*/
+        amdgpu_device_lock_adev(adev);
+        r = amdgpu_device_pre_asic_reset(adev,
+                                         job,
+                                         &need_full_reset);
+        if (r) {
+                /*TODO Should we stop ?*/
+                DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
+                          r, adev->ddev->unique);
+                adev->asic_reset_res = r;
+        }
+
+        /* Build list of devices to reset */
+        if (need_full_reset && adev->gmc.xgmi.num_physical_nodes > 1) {
+                if (!hive) {
+                        amdgpu_device_unlock_adev(adev);
+                        return -ENODEV;
+                }
+
+                /*
+                 * In case we are in XGMI hive mode device reset is done for all the
+                 * nodes in the hive to retrain all XGMI links and hence the reset
+                 * sequence is executed in loop on all nodes.
+                 */
+                device_list_handle = &hive->device_list;
+        } else {
+                list_add_tail(&adev->gmc.xgmi.head, &device_list);
+                device_list_handle = &device_list;
+        }
+
+retry:  /* Rest of adevs pre asic reset from XGMI hive. */
+        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+
+                if (tmp_adev == adev)
+                        continue;
+
+                amdgpu_device_lock_adev(tmp_adev);
+                r = amdgpu_device_pre_asic_reset(tmp_adev,
+                                                 NULL,
+                                                 &need_full_reset);
+                /*TODO Should we stop ?*/
+                if (r) {
+                        DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
+                                  r, tmp_adev->ddev->unique);
+                        tmp_adev->asic_reset_res = r;
+                }
+        }
+
+        /* Actual ASIC resets if needed.*/
+        /* TODO Implement XGMI hive reset logic for SRIOV */
+        if (amdgpu_sriov_vf(adev)) {
+                r = amdgpu_device_reset_sriov(adev, job ? false : true);
+                if (r)
+                        adev->asic_reset_res = r;
+        } else {
+                r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
+                if (r && r == -EAGAIN)
+                        goto retry;
+        }
+
+        /* Post ASIC reset for all devs .*/
+        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+                amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
+
+                if (r) {
+                        /* bad news, how to tell it to userspace ? */
+                        dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+                        amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+                } else {
+                        dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
+                }
+
+                amdgpu_device_unlock_adev(tmp_adev);
+        }
+
+        if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
+                mutex_unlock(&hive->reset_lock);
+
+        if (r)
+                dev_info(adev->dev, "GPU reset end with ret = %d\n", r);

         return r;
 }

+static void amdgpu_device_get_min_pci_speed_width(struct amdgpu_device *adev,
+                                                  enum pci_bus_speed *speed,
+                                                  enum pcie_link_width *width)
+{
+        struct pci_dev *pdev = adev->pdev;
+        enum pci_bus_speed cur_speed;
+        enum pcie_link_width cur_width;
+
+        *speed = PCI_SPEED_UNKNOWN;
+        *width = PCIE_LNK_WIDTH_UNKNOWN;
+
+        while (pdev) {
+                cur_speed = pcie_get_speed_cap(pdev);
+                cur_width = pcie_get_width_cap(pdev);
+
+                if (cur_speed != PCI_SPEED_UNKNOWN) {
+                        if (*speed == PCI_SPEED_UNKNOWN)
+                                *speed = cur_speed;
+                        else if (cur_speed < *speed)
+                                *speed = cur_speed;
+                }
+
+                if (cur_width != PCIE_LNK_WIDTH_UNKNOWN) {
+                        if (*width == PCIE_LNK_WIDTH_UNKNOWN)
+                                *width = cur_width;
+                        else if (cur_width < *width)
+                                *width = cur_width;
+                }
+                pdev = pci_upstream_bridge(pdev);
+        }
+}
+
 /**
  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
  *
@@ -3405,8 +3662,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 {
         struct pci_dev *pdev;
-        enum pci_bus_speed speed_cap;
-        enum pcie_link_width link_width;
+        enum pci_bus_speed speed_cap, platform_speed_cap;
+        enum pcie_link_width platform_link_width;

         if (amdgpu_pcie_gen_cap)
                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
@@ -3423,6 +3680,12 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
                 return;
         }

+        if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
+                return;
+
+        amdgpu_device_get_min_pci_speed_width(adev, &platform_speed_cap,
+                                              &platform_link_width);
+
         if (adev->pm.pcie_gen_mask == 0) {
                 /* asic caps */
                 pdev = adev->pdev;
@@ -3448,22 +3711,20 @@
                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
                 }
                 /* platform caps */
-                pdev = adev->ddev->pdev->bus->self;
-                speed_cap = pcie_get_speed_cap(pdev);
-                if (speed_cap == PCI_SPEED_UNKNOWN) {
+                if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                 } else {
-                        if (speed_cap == PCIE_SPEED_16_0GT)
+                        if (platform_speed_cap == PCIE_SPEED_16_0GT)
                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
-                        else if (speed_cap == PCIE_SPEED_8_0GT)
+                        else if (platform_speed_cap == PCIE_SPEED_8_0GT)
                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
-                        else if (speed_cap == PCIE_SPEED_5_0GT)
+                        else if (platform_speed_cap == PCIE_SPEED_5_0GT)
                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
                         else
@@ -3472,12 +3733,10 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
                 }
         }
         if (adev->pm.pcie_mlw_mask == 0) {
-                pdev = adev->ddev->pdev->bus->self;
-                link_width = pcie_get_width_cap(pdev);
-                if (link_width == PCIE_LNK_WIDTH_UNKNOWN) {
+                if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
                 } else {
-                        switch (link_width) {
+                        switch (platform_link_width) {
                         case PCIE_LNK_X32:
                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
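The doorbell hunks change amdgpu_device_doorbell_init() to size the mapping from the per-ASIC doorbell_index.max_assignment and, from Vega onward, to reserve one extra 4 KiB page (0x400 dwords) for the SDMA paging queue. A minimal userspace sketch of just that arithmetic, with mock inputs standing in for pci_resource_len() and max_assignment (illustration only, not the kernel API):

    #include <stdint.h>
    #include <stdio.h>

    #define DOORBELL_PAGE_DWORDS 0x400 /* one 4 KiB page = 0x400 32-bit doorbells */

    /* Hypothetical stand-ins for the BAR length and
     * adev->doorbell_index.max_assignment. */
    static uint32_t doorbell_count(uint32_t bar_bytes, uint32_t max_assignment,
                                   int is_vega_or_later)
    {
            uint32_t n = bar_bytes / sizeof(uint32_t);

            if (n > max_assignment + 1)
                    n = max_assignment + 1;    /* mirrors the min_t() clamp */
            if (n == 0)
                    return 0;                  /* kernel returns -EINVAL here */
            if (is_vega_or_later)
                    n += DOORBELL_PAGE_DWORDS; /* second page for SDMA paging queue */
            return n;
    }

    int main(void)
    {
            /* 2 MiB doorbell BAR, illustrative max_assignment of 0x3ff */
            printf("doorbells mapped: 0x%x\n", doorbell_count(2 << 20, 0x3ff, 1));
            return 0;
    }

The clamp matters because the BAR can be much larger than the index space the driver actually assigns; the extra page is added after the clamp, which is exactly why the patch's comment notes that AMDGPU_DOORBELL64_MAX_ASSIGNMENT assumes everything lives in the first page.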
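amdgpu_device_should_recover_gpu() is also reworked from a one-line test into a decision table: amdgpu_gpu_recovery == 0 disables recovery outright, SR-IOV VFs always recover, and the -1 ("auto") setting enables recovery only for an explicit allow-list of ASICs. A compact restatement of that logic, with a hypothetical chip enum subset in place of the kernel's asic_type:

    #include <stdbool.h>
    #include <stdio.h>

    /* Mock subset of the kernel's chip enum, illustration only. */
    enum chip { CHIP_BONAIRE, CHIP_VEGA10, CHIP_VEGA20, CHIP_RAVEN, CHIP_OTHER };

    static bool should_recover(int gpu_recovery_param, bool is_sriov_vf,
                               enum chip asic)
    {
            if (gpu_recovery_param == 0)
                    return false;              /* explicitly disabled */
            if (is_sriov_vf)
                    return true;               /* VFs always recover */
            if (gpu_recovery_param == -1) {    /* "auto": allow-list only */
                    switch (asic) {
                    case CHIP_BONAIRE:
                    case CHIP_VEGA10:
                    case CHIP_VEGA20:          /* ...rest of the hunk's list... */
                            return true;
                    default:
                            return false;      /* e.g. CHIP_RAVEN stays off */
                    }
            }
            return true;                       /* param == 1: forced on */
    }

    int main(void)
    {
            printf("auto, Vega20: %d\n", should_recover(-1, false, CHIP_VEGA20));
            printf("auto, Raven:  %d\n", should_recover(-1, false, CHIP_RAVEN));
            return 0;
    }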
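The reset path itself is split into pre/do/post phases: amdgpu_device_pre_asic_reset() still tries a soft reset first and only suspends the IP blocks when a full reset is needed, while amdgpu_do_asic_reset() converts a failed post-reset IB ring test into -EAGAIN so that amdgpu_device_gpu_recover() retries the whole sequence as a full reset. Sketched below as a plain escalation loop with mock reset hooks (not the driver's functions):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical reset hooks; return 0 on success. */
    static int soft_reset(void)   { return -1; } /* pretend soft reset fails */
    static int full_reset(void)   { return 0; }
    static int ib_ring_test(void) { return 0; }

    static int recover(void)
    {
            bool need_full_reset = false;
            int r = 0;

    retry:
            if (!need_full_reset) {
                    r = soft_reset();
                    if (r) {
                            printf("soft reset failed, falling back to full reset\n");
                            need_full_reset = true;
                    }
            }
            if (need_full_reset)
                    r = full_reset();
            if (!r) {
                    r = ib_ring_test();
                    if (r) {            /* kernel returns -EAGAIN here */
                            need_full_reset = true;
                            goto retry; /* retried as a full reset */
                    }
            }
            return r;
    }

    int main(void)
    {
            printf("recover() = %d\n", recover());
            return 0;
    }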
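For XGMI hives, amdgpu_do_asic_reset() queues each node's reset via xgmi_reset_work on system_highpri_wq and then flush_work()s them all, so the per-node resets overlap and the firmware's link-negotiation window (about a second, per the hunk's comment) can be met. A rough userspace analogue of that fan-out/fan-in, with pthreads standing in for the kernel workqueue (an approximation, not how workqueues behave in detail):

    #include <pthread.h>
    #include <stdio.h>

    #define NODES 2

    struct node {
            int id;
            int reset_res;     /* analogue of adev->asic_reset_res */
            pthread_t work;    /* analogue of xgmi_reset_work */
    };

    static void *do_reset(void *arg)
    {
            struct node *n = arg;
            /* real code calls amdgpu_asic_reset(); pretend it succeeds */
            n->reset_res = 0;
            printf("node %d reset done\n", n->id);
            return NULL;
    }

    int main(void)
    {
            struct node hive[NODES] = { { .id = 0 }, { .id = 1 } };
            int r = 0, i;

            /* fan out: queue_work() analogue */
            for (i = 0; i < NODES; i++)
                    pthread_create(&hive[i].work, NULL, do_reset, &hive[i]);

            /* fan in: flush_work() analogue; keep the first failure */
            for (i = 0; i < NODES; i++) {
                    pthread_join(hive[i].work, NULL);
                    if (!r)
                            r = hive[i].reset_res;
            }
            printf("hive reset result: %d\n", r);
            return 0;
    }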
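Finally, the new amdgpu_device_get_min_pci_speed_width() walks every upstream bridge with pci_upstream_bridge() and keeps the minimum speed and width caps, so the reported platform capability is the weakest link in the whole chain rather than just the GPU's parent bridge (the old adev->ddev->pdev->bus->self lookup). The same fold over a mock bridge chain, assuming, like the kernel enums, that larger values mean faster/wider:

    #include <stdio.h>

    /* Mock stand-ins for enum pci_bus_speed / enum pcie_link_width. */
    enum speed { SPEED_UNKNOWN = -1, GEN1, GEN2, GEN3, GEN4 };
    enum width { WIDTH_UNKNOWN = 0, X1 = 1, X4 = 4, X8 = 8, X16 = 16 };

    struct bridge {
            enum speed speed_cap;
            enum width width_cap;
            struct bridge *upstream;   /* pci_upstream_bridge() analogue */
    };

    static void min_speed_width(struct bridge *dev, enum speed *s, enum width *w)
    {
            *s = SPEED_UNKNOWN;
            *w = WIDTH_UNKNOWN;
            for (; dev; dev = dev->upstream) {
                    if (dev->speed_cap != SPEED_UNKNOWN &&
                        (*s == SPEED_UNKNOWN || dev->speed_cap < *s))
                            *s = dev->speed_cap;
                    if (dev->width_cap != WIDTH_UNKNOWN &&
                        (*w == WIDTH_UNKNOWN || dev->width_cap < *w))
                            *w = dev->width_cap;
            }
    }

    int main(void)
    {
            struct bridge root = { GEN3, X16, NULL };  /* slower root port */
            struct bridge gpu  = { GEN4, X16, &root }; /* Gen4-capable GPU */
            enum speed s;
            enum width w;

            min_speed_width(&gpu, &s, &w);
            printf("platform caps: gen%d x%d\n", s + 1, w); /* gen3 x16 */
            return 0;
    }

This is also why the hunk can drop the separate speed_cap/link_width lookups in amdgpu_device_get_pcie_info() and reuse a single platform_speed_cap/platform_link_width pair for both mask computations.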